├── .gitignore ├── LICENSE ├── README.md ├── asset └── teaser.jpg ├── config ├── compositional_image │ └── rbf.yaml └── quantity_aware │ └── rbf.yaml ├── main.py ├── rbf ├── corrector │ ├── __init__.py │ ├── adaptive_sampler.py │ ├── base.py │ ├── dps.py │ ├── reward_model │ │ ├── __init__.py │ │ ├── base.py │ │ ├── counting.py │ │ ├── human.py │ │ ├── imagereward.py │ │ ├── pickscore.py │ │ ├── stylereward.py │ │ ├── vlm.py │ │ ├── vqa.py │ │ └── vqa_server.py │ └── rgrp_sampler.py ├── logger │ ├── __init__.py │ └── logger.py ├── model │ ├── __init__.py │ ├── base.py │ ├── image.py │ └── image_flux.py ├── prior │ ├── __init__.py │ ├── base.py │ ├── denoise_schedulers │ │ ├── __init__.py │ │ └── scheduler.py │ ├── flux.py │ ├── flux_fill.py │ ├── instaflow.py │ ├── sd.py │ └── sd2.py ├── rbf.py ├── shared_modules.py ├── time_sampler │ ├── __init__.py │ └── base.py └── utils │ ├── camera_utils.py │ ├── config_utils.py │ ├── extra_utils.py │ ├── fs_travel_utils.py │ ├── image_utils.py │ ├── path_utils.py │ ├── print_utils.py │ └── random_utils.py ├── requirements.txt ├── setup.py └── third-party └── t2v_metrics ├── .gitignore ├── LICENSE ├── README.md ├── dataset.py ├── datasets ├── SeeTRUE.csv ├── dsg_tifa160_anns.csv ├── stanfordt23d.json ├── sugar_crepe │ ├── add_att.json │ ├── add_obj.json │ ├── replace_att.json │ ├── replace_obj.json │ ├── replace_rel.json │ ├── swap_att.json │ └── swap_obj.json ├── t2vscore_alignment_score.json ├── t2vscore_quality_score.json ├── t2vscore_results.csv └── tifa160.json ├── eval.py ├── genai_bench ├── evaluate.py ├── generate.py └── model_performance_vqacore.md ├── genai_image_eval.py ├── genai_image_ranking.py ├── genai_video_eval.py ├── gpt4_eval.py ├── images ├── 0 │ ├── DALLE3.png │ ├── DeepFloyd.jpg │ ├── Midjourney.jpg │ └── SDXL.jpg ├── 1 │ ├── DALLE3.png │ ├── DeepFloyd.jpg │ ├── Midjourney.jpg │ └── SDXL.jpg ├── 0.png ├── 1.png └── example.png ├── pyproject.toml ├── requirements.txt ├── t2v_metrics ├── __init__.py ├── clipscore.py ├── constants.py ├── itmscore.py ├── models │ ├── __init__.py │ ├── clipscore_models │ │ ├── __init__.py │ │ ├── blip2_itc_model.py │ │ ├── clip_model.py │ │ ├── hpsv2_model.py │ │ └── pickscore_model.py │ ├── itmscore_models │ │ ├── __init__.py │ │ ├── blip2_itm_model.py │ │ └── image_reward_model.py │ ├── model.py │ └── vqascore_models │ │ ├── __init__.py │ │ ├── clip_t5 │ │ ├── __init__.py │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── language_model │ │ │ └── clip_t5.py │ │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ │ └── multimodal_projector │ │ │ └── builder.py │ │ ├── clip_t5_model.py │ │ ├── gpt4v_model.py │ │ ├── instructblip_model.py │ │ ├── lavis │ │ ├── __init__.py │ │ ├── common │ │ │ ├── config.py │ │ │ ├── dist_utils.py │ │ │ ├── gradcam.py │ │ │ ├── logger.py │ │ │ ├── optims.py │ │ │ ├── registry.py │ │ │ ├── utils.py │ │ │ └── vqa_tools │ │ │ │ ├── __init__.py │ │ │ │ ├── vqa.py │ │ │ │ └── vqa_eval.py │ │ ├── configs │ │ │ ├── datasets │ │ │ │ ├── aokvqa │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── avsd │ │ │ │ │ └── defaults_dial.yaml │ │ │ │ ├── coco │ │ │ │ │ ├── defaults_cap.yaml │ │ │ │ │ ├── defaults_ret.yaml │ │ │ │ │ ├── defaults_vqa.yaml │ │ │ │ │ └── eval_vqa.yaml │ │ │ │ ├── conceptual_caption │ │ │ │ │ ├── defaults_12m.yaml │ │ │ │ │ └── defaults_3m.yaml │ │ │ │ ├── didemo │ │ │ │ │ └── defaults_ret.yaml │ │ │ │ ├── flickr30k │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── gqa │ │ │ │ │ ├── balanced_testdev.yaml │ │ │ │ │ ├── balanced_val.yaml │ │ │ │ │ └── defaults.yaml 
│ │ │ │ ├── imagenet │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── laion │ │ │ │ │ └── defaults_2B_multi.yaml │ │ │ │ ├── msrvtt │ │ │ │ │ ├── defaults_cap.yaml │ │ │ │ │ ├── defaults_qa.yaml │ │ │ │ │ └── defaults_ret.yaml │ │ │ │ ├── msvd │ │ │ │ │ ├── defaults_cap.yaml │ │ │ │ │ └── defaults_qa.yaml │ │ │ │ ├── nlvr │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── nocaps │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── okvqa │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── sbu_caption │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── snli_ve │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── vatex │ │ │ │ │ └── defaults_cap.yaml │ │ │ │ └── vg │ │ │ │ │ ├── defaults_caption.yaml │ │ │ │ │ └── defaults_vqa.yaml │ │ │ ├── default.yaml │ │ │ └── models │ │ │ │ ├── albef_classification_ve.yaml │ │ │ │ ├── albef_feature_extractor.yaml │ │ │ │ ├── albef_nlvr.yaml │ │ │ │ ├── albef_pretrain_base.yaml │ │ │ │ ├── albef_retrieval_coco.yaml │ │ │ │ ├── albef_retrieval_flickr.yaml │ │ │ │ ├── albef_vqav2.yaml │ │ │ │ ├── alpro_qa_msrvtt.yaml │ │ │ │ ├── alpro_qa_msvd.yaml │ │ │ │ ├── alpro_retrieval_didemo.yaml │ │ │ │ ├── alpro_retrieval_msrvtt.yaml │ │ │ │ ├── bert_config.json │ │ │ │ ├── bert_config_alpro.json │ │ │ │ ├── blip2 │ │ │ │ ├── blip2_caption_flant5xl.yaml │ │ │ │ ├── blip2_caption_opt2.7b.yaml │ │ │ │ ├── blip2_caption_opt6.7b.yaml │ │ │ │ ├── blip2_coco.yaml │ │ │ │ ├── blip2_instruct_flant5xl.yaml │ │ │ │ ├── blip2_instruct_flant5xxl.yaml │ │ │ │ ├── blip2_instruct_vicuna13b.yaml │ │ │ │ ├── blip2_instruct_vicuna7b.yaml │ │ │ │ ├── blip2_pretrain.yaml │ │ │ │ ├── blip2_pretrain_flant5xl.yaml │ │ │ │ ├── blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml │ │ │ │ ├── blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml │ │ │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ │ │ ├── blip2_pretrain_vitL.yaml │ │ │ │ ├── blip2_vicuna13b.yaml │ │ │ │ └── blip2_vicuna7b.yaml │ │ │ │ ├── blip_caption_base_coco.yaml │ │ │ │ ├── blip_caption_large_coco.yaml │ │ │ │ ├── blip_classification_base.yaml │ │ │ │ ├── blip_feature_extractor_base.yaml │ │ │ │ ├── blip_itm_base.yaml │ │ │ │ ├── blip_itm_large.yaml │ │ │ │ ├── blip_nlvr.yaml │ │ │ │ ├── blip_pretrain_base.yaml │ │ │ │ ├── blip_pretrain_large.yaml │ │ │ │ ├── blip_retrieval_coco.yaml │ │ │ │ ├── blip_retrieval_flickr.yaml │ │ │ │ ├── blip_vqa_aokvqa.yaml │ │ │ │ ├── blip_vqa_okvqa.yaml │ │ │ │ ├── blip_vqav2.yaml │ │ │ │ ├── clip │ │ │ │ ├── RN101-quickgelu.json │ │ │ │ ├── RN101.json │ │ │ │ ├── RN50-quickgelu.json │ │ │ │ ├── RN50.json │ │ │ │ ├── RN50x16.json │ │ │ │ ├── RN50x4.json │ │ │ │ ├── ViT-B-16-plus-240.json │ │ │ │ ├── ViT-B-16-plus.json │ │ │ │ ├── ViT-B-16.json │ │ │ │ ├── ViT-B-32-plus-256.json │ │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ │ ├── ViT-B-32.json │ │ │ │ ├── ViT-H-14.json │ │ │ │ ├── ViT-H-16.json │ │ │ │ ├── ViT-L-14-280.json │ │ │ │ ├── ViT-L-14-336.json │ │ │ │ ├── ViT-L-14.json │ │ │ │ ├── ViT-L-16-320.json │ │ │ │ ├── ViT-L-16.json │ │ │ │ ├── ViT-g-14.json │ │ │ │ ├── timm-efficientnetv2_rw_s.json │ │ │ │ ├── timm-resnet50d.json │ │ │ │ ├── timm-resnetaa50d.json │ │ │ │ ├── timm-resnetblur50.json │ │ │ │ ├── timm-swin_base_patch4_window7_224.json │ │ │ │ ├── timm-vit_base_patch16_224.json │ │ │ │ ├── timm-vit_base_patch32_224.json │ │ │ │ └── timm-vit_small_patch16_224.json │ │ │ │ ├── clip_resnet50.yaml │ │ │ │ ├── clip_vit_base16.yaml │ │ │ │ ├── clip_vit_base32.yaml │ │ │ │ ├── clip_vit_large14.yaml │ │ │ │ ├── 
clip_vit_large14_336.yaml │ │ │ │ ├── gpt_dialogue_base.yaml │ │ │ │ ├── img2prompt-vqa │ │ │ │ └── img2prompt_vqa_base.yaml │ │ │ │ ├── med_config.json │ │ │ │ ├── med_config_albef.json │ │ │ │ ├── med_large_config.json │ │ │ │ └── pnp-vqa │ │ │ │ ├── pnp_vqa_3b.yaml │ │ │ │ ├── pnp_vqa_base.yaml │ │ │ │ ├── pnp_vqa_large.yaml │ │ │ │ ├── unifiedqav2_3b_config.json │ │ │ │ ├── unifiedqav2_base_config.json │ │ │ │ └── unifiedqav2_large_config.json │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── albef_models │ │ │ │ ├── __init__.py │ │ │ │ ├── albef_classification.py │ │ │ │ ├── albef_feature_extractor.py │ │ │ │ ├── albef_nlvr.py │ │ │ │ ├── albef_outputs.py │ │ │ │ ├── albef_pretrain.py │ │ │ │ ├── albef_retrieval.py │ │ │ │ └── albef_vqa.py │ │ │ ├── base_model.py │ │ │ ├── blip2_models │ │ │ │ ├── Qformer.py │ │ │ │ ├── __init__.py │ │ │ │ ├── blip2.py │ │ │ │ ├── blip2_image_text_matching.py │ │ │ │ ├── blip2_qformer.py │ │ │ │ ├── blip2_t5.py │ │ │ │ ├── blip2_t5_instruct.py │ │ │ │ ├── blip2_vicuna.py │ │ │ │ ├── blip2_vicuna_instruct.py │ │ │ │ ├── modeling_llama.py │ │ │ │ └── modeling_t5.py │ │ │ ├── blip_models │ │ │ │ ├── __init__.py │ │ │ │ ├── blip.py │ │ │ │ ├── blip_caption.py │ │ │ │ ├── blip_classification.py │ │ │ │ ├── blip_feature_extractor.py │ │ │ │ ├── blip_image_text_matching.py │ │ │ │ ├── blip_nlvr.py │ │ │ │ ├── blip_outputs.py │ │ │ │ ├── blip_pretrain.py │ │ │ │ ├── blip_retrieval.py │ │ │ │ ├── blip_vqa.py │ │ │ │ └── nlvr_encoder.py │ │ │ ├── clip_vit.py │ │ │ ├── eva_vit.py │ │ │ ├── med.py │ │ │ └── vit.py │ │ └── processors │ │ │ ├── __init__.py │ │ │ ├── base_processor.py │ │ │ ├── blip_processors.py │ │ │ └── randaugment.py │ │ ├── llava │ │ ├── __init__.py │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── language_model │ │ │ └── llava_llama.py │ │ │ ├── llava_arch.py │ │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ │ └── multimodal_projector │ │ │ └── builder.py │ │ ├── llava16_model.py │ │ ├── llava_16 │ │ ├── __init__.py │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── language_model │ │ │ └── llava_llama.py │ │ │ ├── llava_arch.py │ │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ │ └── multimodal_projector │ │ │ └── builder.py │ │ ├── llava_model.py │ │ ├── mm_utils.py │ │ └── vqa_model.py ├── score.py └── vqascore.py └── tau_optimization.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 KAIST Visual AI Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /asset/teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/asset/teaser.jpg -------------------------------------------------------------------------------- /config/compositional_image/rbf.yaml: -------------------------------------------------------------------------------- 1 | batch_size: 2 2 | init_n_particles: 25 3 | 4 | convert_scheduler: vp 5 | sample_method: sde 6 | text_prompt: 7 | 8 | #=============================================================================== 9 | 10 | max_nfe: 500 11 | max_steps: 10 12 | block_size: 1 13 | n_particles: 1 14 | scheduler_n: 1.0 15 | 16 | #=============================================================================== 17 | 18 | root_dir: ./results 19 | tag: ${filtering_method} 20 | save_now: True 21 | 22 | device: 0 23 | seed: 0 24 | 25 | filtering_method: rbf 26 | 27 | #=============================================================================== 28 | # Trainer settings 29 | #=============================================================================== 30 | disable_debug: False 31 | 32 | #=============================================================================== 33 | # Dataset 34 | #=============================================================================== 35 | height: 1024 36 | width: 1024 37 | 38 | #=============================================================================== 39 | # Time sampler 40 | #=============================================================================== 41 | time_sampler: flux_scheduler 42 | time_schedule: exp 43 | t_max: 1000 44 | 45 | #=============================================================================== 46 | # Model 47 | #=============================================================================== 48 | model: flux_image 49 | channels: 4096 # [B 4096 64] 50 | 51 | #=============================================================================== 52 | # Prior 53 | #=============================================================================== 54 | prior: flux 55 | guidance_scale: 3.5 56 | 57 | diffusion_coefficient: square 58 | diffusion_norm: 3.0 59 | 60 | # Only used for "exp" diffusion coefficient 61 | exp_diff_coeff_sigma: 0.1 62 | 63 | model_name: "black-forest-labs/FLUX.1-schnell" 64 | 65 | #=============================================================================== 66 | # Logger 67 | #=============================================================================== 68 | logger: self 69 | log_interval: 1 70 | 71 | #=============================================================================== 72 | # Corrector 73 | #=============================================================================== 74 | corrector: particle 75 | 76 | reward_weight: 0.5 77 | reward_score: vqa 78 | 79 | vqa_model: clip-flant5-xxl 80 | vqa_batch_size: 32 81 | 82 | vqa_device: 1 83 | image_reward_weight: 0.0 84 | -------------------------------------------------------------------------------- /config/quantity_aware/rbf.yaml: 
-------------------------------------------------------------------------------- 1 | max_nfe: 500 2 | batch_size: 2 3 | tau_norm: 0.0 4 | 5 | text_prompt: Six airplanes flying over a desert with seven camels walking below 6 | class_gt_counts: 6, 7 7 | class_names: airplanes, camels 8 | 9 | max_steps: 10 10 | 11 | block_size: 1 12 | n_particles: 1 13 | sample_method: sde 14 | convert_scheduler: vp 15 | scheduler_n: 1 16 | 17 | init_n_particles: 25 18 | 19 | 20 | #=============================================================================== 21 | 22 | root_dir: ./results 23 | tag: ${filtering_method} 24 | save_now: True 25 | 26 | device: 0 27 | seed: 0 28 | 29 | filtering_method: rbf 30 | 31 | #=============================================================================== 32 | # Trainer settings 33 | #=============================================================================== 34 | disable_debug: False 35 | 36 | #=============================================================================== 37 | # Dataset 38 | #=============================================================================== 39 | height: 1024 40 | width: 1024 41 | 42 | #=============================================================================== 43 | # Time sampler 44 | #=============================================================================== 45 | time_sampler: flux_scheduler 46 | time_schedule: exp 47 | t_max: 1000 48 | 49 | #=============================================================================== 50 | # Model 51 | #=============================================================================== 52 | model: flux_image 53 | channels: 4096 # [B 4096 64] 54 | 55 | #=============================================================================== 56 | # Prior 57 | #=============================================================================== 58 | prior: flux 59 | guidance_scale: 3.5 60 | 61 | diffusion_coefficient: square 62 | diffusion_norm: 3.0 63 | 64 | model_name: "black-forest-labs/FLUX.1-schnell" 65 | 66 | #=============================================================================== 67 | # Logger 68 | #=============================================================================== 69 | logger: self 70 | log_interval: 1 71 | 72 | #=============================================================================== 73 | # Corrector 74 | #=============================================================================== 75 | corrector: particle 76 | 77 | reward_score: counting 78 | reward_func: diff 79 | count_reward_model: gdsam -------------------------------------------------------------------------------- /rbf/corrector/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import DDIMCorrector 2 | from .rgrp_sampler import RGRPSampler 3 | from .adaptive_sampler import AdaptiveSampler 4 | from .dps import DiffRewardCorrector 5 | 6 | CORRECTORs = { 7 | "ddim": DDIMCorrector, 8 | "particle": RGRPSampler, 9 | "adaptive": AdaptiveSampler, 10 | "diff": DiffRewardCorrector, 11 | } 12 | 13 | CORRECTOR_REQUIRING_GRADIENT = ["diff"] -------------------------------------------------------------------------------- /rbf/corrector/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | from random import randint 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from rbf.utils.extra_utils import ignore_kwargs 9 | from 
rbf.utils.print_utils import print_warning, print_error, print_info 10 | from rbf import shared_modules as sm 11 | 12 | 13 | class Corrector(ABC): 14 | @ignore_kwargs 15 | @dataclass 16 | class Config: 17 | correct_steps: int = 1 18 | 19 | 20 | def __init__(self, cfg): 21 | self.cfg = self.Config(**cfg) 22 | self.potentials = [] 23 | 24 | 25 | @abstractmethod 26 | def pre_correct(self, images): 27 | # Correct samples 28 | pass 29 | 30 | 31 | @abstractmethod 32 | def post_correct(self, images): 33 | # Correct samples 34 | pass 35 | 36 | 37 | class DDIMCorrector(Corrector): 38 | def pre_correct( 39 | self, 40 | noisy_sample, 41 | tweedie, 42 | model_pred, 43 | step=None 44 | ): 45 | return noisy_sample, model_pred 46 | 47 | def post_correct( 48 | self, 49 | prev_noisy_sample, 50 | tweedie, 51 | model_pred, 52 | step, 53 | ): 54 | return prev_noisy_sample, tweedie, model_pred 55 | 56 | def final_correct( 57 | self, 58 | noisy_sample, 59 | tweedie, 60 | step=None 61 | ): 62 | return noisy_sample, tweedie 63 | 64 | -------------------------------------------------------------------------------- /rbf/corrector/dps.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | from random import randint 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from rbf.utils.extra_utils import ignore_kwargs 9 | from rbf.utils.print_utils import print_warning, print_error, print_info 10 | from rbf import shared_modules as sm 11 | from rbf.corrector.base import Corrector 12 | from rbf.corrector.rgrp_sampler import RGRPSampler 13 | 14 | 15 | class DiffRewardCorrector(RGRPSampler): 16 | @ignore_kwargs 17 | @dataclass 18 | class Config(RGRPSampler.Config): 19 | strength: float = 1.0 20 | device: int = 0 21 | batch_size: int = 1 22 | n_particles: int = 1 23 | 24 | reward_score: str = 'style' 25 | guidance_method: str = 'dps' 26 | 27 | disable_debug: bool = False 28 | log_interval: int = 5 29 | 30 | 31 | def __init__(self, cfg): 32 | super().__init__(cfg) 33 | self.cfg = self.Config(**cfg) 34 | 35 | 36 | def adjust_sample_size(self, t_curr, step): 37 | return self.cfg.n_particles 38 | 39 | 40 | def apply_guidance( 41 | self, 42 | noisy_sample, 43 | tweedie, 44 | step, 45 | ): 46 | 47 | if self.cfg.guidance_method == "dps": 48 | weight = self.reward_model(tweedie, step) 49 | 50 | # NOTE: Weight is computed as -loss. 
51 | # Applying guidance with +gradient 52 | 53 | grad = torch.autograd.grad(weight.sum(), noisy_sample)[0] 54 | 55 | prev_latent_noisy = noisy_sample + (self.cfg.strength * grad) / torch.abs(weight.view(-1, * ([1] * (len(noisy_sample.shape) - 1)) )) 56 | 57 | else: 58 | raise NotImplementedError(f"Guidance {self.cfg.guidance_method} not implemented") 59 | 60 | return prev_latent_noisy 61 | 62 | 63 | def post_correct( 64 | self, 65 | noisy_sample, 66 | tweedie, 67 | model_pred, 68 | step, 69 | ): 70 | 71 | rgb_tweedie = sm.prior.decode_latent( 72 | tweedie, convert_to_float=False 73 | ) 74 | 75 | # Apply guidance (DPS/FreeDoM) 76 | prev_latent_noisy = self.apply_guidance( 77 | noisy_sample, 78 | rgb_tweedie, 79 | step, 80 | ) 81 | 82 | ( 83 | resample_noisy_sample, 84 | resample_tweedie, 85 | resample_model_pred 86 | ) = super().post_correct( 87 | prev_latent_noisy, 88 | tweedie, 89 | model_pred, 90 | ) 91 | 92 | return (resample_noisy_sample, resample_tweedie, resample_model_pred) # B x D -------------------------------------------------------------------------------- /rbf/corrector/reward_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import AestheticRewardModel, CompressionRewardModel, InpaintingRewardModel 2 | from .counting import CountingRewardModel 3 | from .vlm import VLMRewardModel 4 | from .human import HumanRewardModel 5 | from .pickscore import PickScoreRewardModel 6 | from .vqa import VQARewardModel 7 | from .imagereward import ImageRewardRewardModel -------------------------------------------------------------------------------- /rbf/corrector/reward_model/vqa_server.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.managers import BaseManager 2 | import torch 3 | import argparse 4 | import time 5 | import t2v_metrics 6 | import ImageReward as RM 7 | 8 | class RemoteVQAManager(BaseManager): 9 | pass 10 | 11 | @torch.no_grad() 12 | def process_VQA(input): 13 | start_time = time.time(); 14 | 15 | images = input.get("images"); 16 | text_prompt = input.get("text"); 17 | image_reward_weight = input.get("irw"); 18 | 19 | scores = [] 20 | for idx in range(0, len(images), vqa_batch_size): 21 | cur_batch_size = min(vqa_batch_size, len(images) - idx) 22 | cur_images = images[idx:idx+cur_batch_size] 23 | cur_scores = vqa_reward_model(images=cur_images, texts=[text_prompt]) 24 | 25 | if image_reward_weight > 0.0: 26 | image_reward_score = rm_model.score(text_prompt, cur_images) 27 | if type(image_reward_score) is float: 28 | image_reward_score = [image_reward_score] 29 | image_reward_score = torch.tensor(image_reward_score).to(cur_scores) 30 | cur_scores = cur_scores + image_reward_score[:, None] * image_reward_weight 31 | 32 | scores += cur_scores.reshape(-1).cpu().numpy().tolist(); 33 | 34 | output = { 35 | "scores": scores 36 | }; 37 | 38 | print("Reward calculation took {:.3f}s".format(time.time() - start_time)); 39 | return output; 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser(); 44 | 45 | parser.add_argument("--gpu", type = str, default = "0"); 46 | parser.add_argument("--addr", type = int, default = 5000); 47 | parser.add_argument("--vqa_model", type = str, default = "clip-flant5-xxl"); 48 | parser.add_argument("--vqa_batch_size", type = int, default = 32); 49 | 50 | args = parser.parse_args(); 51 | 52 | RemoteVQAManager.register("process_VQA", callable = process_VQA); 53 | 54 | device = "cuda:{}".format(args.gpu); 55
| vqa_batch_size = args.vqa_batch_size; 56 | vqa_model = args.vqa_model; 57 | 58 | vqa_reward_model = t2v_metrics.get_score_model(model = vqa_model, device = device); 59 | rm_model = RM.load("ImageReward-v1.0", device = device); 60 | 61 | manager = RemoteVQAManager(address=("localhost", int(args.addr)), authkey=b"secret") 62 | server = manager.get_server() 63 | print("Server started... Listening for requests.") 64 | server.serve_forever() 65 | -------------------------------------------------------------------------------- /rbf/logger/__init__.py: -------------------------------------------------------------------------------- 1 | from .logger import SelfLogger 2 | 3 | LOGGERs = { 4 | "self": SelfLogger, 5 | } 6 | -------------------------------------------------------------------------------- /rbf/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .image import ImageModel 2 | from .image_flux import FluxImageModel 3 | 4 | MODELs = { 5 | "image": ImageModel, 6 | "flux_image": FluxImageModel, 7 | } 8 | -------------------------------------------------------------------------------- /rbf/prior/__init__.py: -------------------------------------------------------------------------------- 1 | from .sd import ( 2 | StableDiffusionPrior, 3 | ) 4 | 5 | from .flux import FluxPrior 6 | from .instaflow import InstaFlowPrior 7 | from .flux_fill import FluxFillPrior 8 | 9 | from .sd2 import SD2Prior 10 | PRIORs = { 11 | "sd": StableDiffusionPrior, 12 | "flux": FluxPrior, 13 | "flux_fill": FluxFillPrior, 14 | "instaflow": InstaFlowPrior, 15 | "sd2": SD2Prior, 16 | } 17 | -------------------------------------------------------------------------------- /rbf/prior/denoise_schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | from .scheduler import ( 2 | CondOTScheduler, 3 | PolynomialConvexScheduler, 4 | VPScheduler, 5 | LinearVPScheduler, 6 | CosineScheduler, 7 | GeneralConvexScheduler, 8 | ) 9 | 10 | -------------------------------------------------------------------------------- /rbf/shared_modules.py: -------------------------------------------------------------------------------- 1 | dataset = None 2 | background = None 3 | model = None 4 | prior = None 5 | logger = None 6 | time_sampler = None 7 | noise_sampler = None 8 | corrector = None 9 | 10 | OFF_LOG = False 11 | DO_NOT_SAVE_INTERMEDIATE_IMAGES = False 12 | 13 | def assert_initialized(): 14 | assert ( 15 | dataset is not None 16 | and background is not None 17 | and model is not None 18 | and prior is not None 19 | and logger is not None 20 | and time_sampler is not None 21 | and noise_sampler is not None 22 | ), "Please initialize the shared modules before using them." 
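For reference, vqa_server.py above exposes process_VQA through a multiprocessing.managers.BaseManager bound to localhost (port taken from --addr, default 5000) with authkey b"secret". The following is a minimal client sketch under those defaults; the image path and prompt are placeholders, and the request keys ("images", "text", "irw") mirror the ones the server reads:

from multiprocessing.managers import BaseManager
from PIL import Image

class RemoteVQAManager(BaseManager):
    pass

# Register only the name on the client; the callable itself lives in the server process.
RemoteVQAManager.register("process_VQA")

manager = RemoteVQAManager(address=("localhost", 5000), authkey=b"secret")
manager.connect()

request = {
    "images": [Image.open("sample.png").convert("RGB")],  # placeholder image; PIL images pickle across the connection
    "text": "Six airplanes flying over a desert with seven camels walking below",
    "irw": 0.0,  # image_reward_weight; > 0.0 additionally mixes in the ImageReward score
}
result = manager.process_VQA(request)  # the server returns a proxy to its result dict
print(result.get("scores"))            # one VQA score per submitted image

Running the scorer in a separate process like this is presumably what allows the heavyweight VQA model to sit on its own GPU (the vqa_device entry in the compositional_image config) while the sampler occupies another.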
-------------------------------------------------------------------------------- /rbf/time_sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import LinearAnnealingTimeSampler, FluxTimeSampler, SDTimeSampler 2 | 3 | TIME_SAMPLERs = { 4 | "linear_annealing": LinearAnnealingTimeSampler, 5 | "flux_scheduler": FluxTimeSampler, 6 | "sd_scheduler": SDTimeSampler 7 | } -------------------------------------------------------------------------------- /rbf/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | from omegaconf import OmegaConf, DictConfig 3 | 4 | 5 | def load_config(*yamls: str, cli_args: Optional[list] = None, from_string=False, **kwargs) -> Any: 6 | if from_string: 7 | yaml_confs = [OmegaConf.create(s) for s in yamls] 8 | else: 9 | yaml_confs = [OmegaConf.load(f) for f in yamls] 10 | cli_conf = OmegaConf.from_cli(cli_args) 11 | cfg = OmegaConf.merge(*yaml_confs, cli_conf, kwargs) 12 | OmegaConf.resolve(cfg) 13 | assert isinstance(cfg, DictConfig) 14 | 15 | return cfg 16 | 17 | def fetch_config(self, cfg): 18 | """ 19 | Fetch dataclass variables to local variables 20 | self: any class object 21 | cfg: any dataclass object 22 | """ 23 | 24 | for key, value in cfg.items(): 25 | setattr(self, key, value) 26 | return self 27 | -------------------------------------------------------------------------------- /rbf/utils/print_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | print_utils.py 3 | 4 | Utility functions for printing fancy messages. 5 | """ 6 | 7 | import textwrap 8 | 9 | class color: 10 | purple = '\033[95m' 11 | cyan = '\033[96m' 12 | darkcyan = '\033[36m' 13 | blue = '\033[94m' 14 | green = '\033[92m' 15 | yellow = '\033[93m' 16 | red = '\033[91m' 17 | bold = '\033[1m' 18 | end = '\033[0m' 19 | 20 | def print_with_box(text: str, box_color: str = color.purple, text_color: str = color.end, title: str = "", max_len = 88) -> None: 21 | """ 22 | Prints a message with a box around it. 23 | """ 24 | lines = text.split("\n") 25 | if len(title) > max_len - 3: 26 | title = title[:max_len - 6] + "..." 
27 | text_len = max([len(line) for line in lines]) 28 | title_len = len(title) 29 | line_len = min(max_len, max(title_len, text_len)) 30 | 31 | # if each line is longer than max_len, break it into multiple lines 32 | new_lines = [] 33 | for line in lines: 34 | while len(line) > line_len: 35 | new_lines.append(line[:line_len]) 36 | line = line[line_len:] 37 | new_lines.append(line) 38 | lines = new_lines 39 | 40 | bar_len = line_len - len(title) 41 | front_bar_len = bar_len // 2 42 | back_bar_len = bar_len - front_bar_len 43 | print(box_color+"╭─" + "─"*front_bar_len + title + "─"*back_bar_len + "─╮"+color.end) 44 | for line in lines: 45 | print(box_color+"│ " + text_color + line.ljust(line_len) + box_color + " │"+color.end) 46 | print(box_color+"╰" + "─" * (line_len + 2) + "╯"+color.end) 47 | 48 | def print_warning(*args) -> None: 49 | text = ' '.join(map(str, args)) 50 | print(color.yellow + color.bold + '[Warning] ' + color.end + color.yellow + text + color.end) 51 | 52 | def print_info(*args) -> None: 53 | text = ' '.join(map(str, args)) 54 | print(color.green + color.bold + '[Info] ' + color.end + color.green + text + color.end) 55 | 56 | def print_error(*args) -> None: 57 | text = ' '.join(map(str, args)) 58 | print(color.red + color.bold + '[Error] ' + color.end + color.red + text + color.end) 59 | 60 | def print_note(*args) -> None: 61 | text = ' '.join(map(str, args)) 62 | print(color.cyan + color.bold + '[NOTE] ' + color.end + color.cyan + text + color.end) 63 | 64 | def print_wrap(text, max_width=100): 65 | wrapped_text = textwrap.fill(text, width=max_width) 66 | print(wrapped_text) 67 | 68 | 69 | def print_qna(question, response): 70 | print("************************************") 71 | print_wrap(f"*** [QUESTION]\n{question}") 72 | print("************************************") 73 | print_wrap(f"*** [RESPONSE]\n{response}") 74 | print("************************************") 75 | -------------------------------------------------------------------------------- /rbf/utils/random_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | random_utils.py 3 | 4 | Utility functions for controlling randomness. 5 | """ 6 | 7 | import os 8 | import numpy as np 9 | import torch 10 | import random 11 | 12 | 13 | def seed_everything(seed=0): 14 | """ 15 | Seeds the random number generators of Python, Numpy and PyTorch. 
16 | """ 17 | os.environ["PYTHONHASHSEED"] = str(seed) 18 | random.seed(seed) 19 | np.random.seed(seed) 20 | torch.manual_seed(seed) 21 | torch.cuda.manual_seed(seed) 22 | torch.cuda.manual_seed_all(seed) 23 | torch.backends.cudnn.deterministic = True 24 | torch.backends.cudnn.benchmark = False -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==1.4.0 2 | diffusers==0.32.2 3 | einops==0.8.1 4 | huggingface-hub==0.29.1 5 | image-reward==1.5 6 | imageio==2.37.0 7 | matplotlib==3.10.0 8 | natsort==8.4.0 9 | numpy==1.26.3 10 | omegaconf==2.3.0 11 | pillow==11.0.0 12 | scikit-learn==1.6.1 13 | scipy==1.15.2 14 | sentencepiece==0.2.0 15 | supervision==0.25.1 16 | tokenizers==0.21.0 17 | tqdm==4.67.1 18 | transformers==4.49.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="rbf", 5 | version=0.1, 6 | packages=["rbf"], 7 | zip_safe=False, 8 | ) 9 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/eval.py: -------------------------------------------------------------------------------- 1 | # Evaluate on all datasets in VQAScore paper 2 | 3 | import argparse 4 | import os 5 | import t2v_metrics 6 | from dataset import Winoground, NaturalBench_Retrieval, EqBen_Mini, StanfordT23D, TIFA160_DSG, Flickr8K_CF, SeeTrue, Pickapic_v1, T2VScore 7 | 8 | 9 | def config(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--root_dir", default="./datasets", type=str, 12 | help='Root directory for saving datasets.') 13 | parser.add_argument("--cache_dir", default=t2v_metrics.constants.HF_CACHE_DIR, type=str) 14 | parser.add_argument("--device", default="cuda", type=str) 15 | parser.add_argument("--batch_size", default=16, type=int) 16 | parser.add_argument("--model", default="clip-flant5-xxl", type=str) 17 | parser.add_argument("--question", default=None, type=str) 18 | parser.add_argument("--answer", default=None, type=str) 19 | return parser.parse_args() 20 | 21 | def main(): 22 | args = config() 23 | if not os.path.exists(args.root_dir): 24 | os.makedirs(args.root_dir) 25 | 26 | score_func = t2v_metrics.get_score_model(model=args.model, device=args.device, cache_dir=args.cache_dir) 27 | 28 | kwargs = {} 29 | if args.question is not None: 30 | print(f"Using question template: {args.question}") 31 | kwargs['question_template'] = args.question 32 | if args.answer is not None: 33 | print(f"Using answer template: {args.answer}") 34 | kwargs['answer_template'] = args.answer 35 | 36 | print(f"Performance of {args.model}.") 37 | for dataset_cls in [ 38 | Winoground, 39 | NaturalBench_Retrieval, 40 | EqBen_Mini, 41 | TIFA160_DSG, 42 | Pickapic_v1, 43 | SeeTrue, 44 | StanfordT23D, 45 | T2VScore, 46 | Flickr8K_CF 47 | ]: 48 | 49 | dataset = dataset_cls(root_dir=args.root_dir) 50 | scores = score_func.batch_forward(dataset, batch_size=args.batch_size, **kwargs).cpu() 51 | dataset.evaluate_scores(scores) 52 | 53 | if __name__ == "__main__": 54 | main() 55 | 56 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/genai_video_eval.py: -------------------------------------------------------------------------------- 1 | # Evaluate on GenAI-Bench-Video using a specific model 2 | # 
Example scripts to run: 3 | # VQAScore: python genai_video_eval.py --model clip-flant5-xxl 4 | # CLIPScore: python genai_video_eval.py --model openai:ViT-L-14-336 5 | import argparse 6 | import os 7 | import t2v_metrics 8 | from dataset import GenAIBench_Video 9 | import json 10 | import torch 11 | import numpy as np 12 | from genai_image_eval import show_performance_per_skill 13 | 14 | def config(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--root_dir", default="./datasets", type=str, 17 | help='Root directory for saving datasets.') 18 | parser.add_argument("--cache_dir", default=t2v_metrics.constants.HF_CACHE_DIR, type=str) 19 | parser.add_argument("--device", default="cuda", type=str) 20 | parser.add_argument("--batch_size", default=16, type=int) 21 | parser.add_argument("--num_prompts", default=800, type=int, choices=[527, 800]) 22 | parser.add_argument("--model", default="clip-flant5-xxl", type=str) 23 | parser.add_argument("--question", default=None, type=str) 24 | parser.add_argument("--answer", default=None, type=str) 25 | parser.add_argument("--result_dir", default="./genai_video_results", type=str) 26 | parser.add_argument("--eval_mode", default="avg_frames", type=str) 27 | return parser.parse_args() 28 | 29 | 30 | def main(): 31 | args = config() 32 | if not os.path.exists(args.root_dir): 33 | os.makedirs(args.root_dir) 34 | 35 | os.makedirs(args.result_dir, exist_ok=True) 36 | result_path = f"{args.result_dir}/{args.model}_{args.eval_mode}_{args.num_prompts}_prompts.pt" 37 | dataset = GenAIBench_Video(root_dir=args.root_dir, eval_mode=args.eval_mode, num_prompts=args.num_prompts) 38 | if os.path.exists(result_path): 39 | print(f"Result file {result_path} already exists. Skipping.") 40 | scores = torch.load(result_path) 41 | else: 42 | score_func = t2v_metrics.get_score_model(model=args.model, device=args.device, cache_dir=args.cache_dir) 43 | 44 | kwargs = {} 45 | if args.question is not None: 46 | print(f"Using question template: {args.question}") 47 | kwargs['question_template'] = args.question 48 | if args.answer is not None: 49 | print(f"Using answer template: {args.answer}") 50 | kwargs['answer_template'] = args.answer 51 | 52 | print(f"Performance of {args.model} on using {args.eval_mode}.") 53 | scores = score_func.batch_forward(dataset, batch_size=args.batch_size, **kwargs).cpu() 54 | torch.save(scores, result_path) 55 | 56 | ### Get performance per skill 57 | our_scores = scores.mean(axis=1) 58 | show_performance_per_skill(our_scores, dataset, items_name='videos', prompt_to_items_name='prompt_to_videos', print_std=True) 59 | 60 | print("Alignment Performance") 61 | ### Alignment performance 62 | dataset.evaluate_scores(scores) 63 | 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/0.png -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/0/DALLE3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/0/DALLE3.png 
-------------------------------------------------------------------------------- /third-party/t2v_metrics/images/0/DeepFloyd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/0/DeepFloyd.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/0/Midjourney.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/0/Midjourney.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/0/SDXL.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/0/SDXL.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/1.png -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/1/DALLE3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/1/DALLE3.png -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/1/DeepFloyd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/1/DeepFloyd.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/1/Midjourney.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/1/Midjourney.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/1/SDXL.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/1/SDXL.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/example.png -------------------------------------------------------------------------------- /third-party/t2v_metrics/pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "t2v_metrics" 7 | version = "1.2" 8 | description = "Evaluating Text-to-Visual Generation with Image-to-Text Generation." 9 | authors = [ 10 | {name="Zhiqiu Lin", email="zl279@cornell.edu"}, 11 | ] 12 | readme = "README.md" 13 | requires-python = ">=3.10" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: Apache Software License", 17 | ] 18 | dependencies = [ 19 | "ftfy>=6.1.1", 20 | "tqdm>=4.64.1", 21 | "gdown>=4.7.1", 22 | "huggingface-hub>=0.19.4", 23 | "open-clip-torch>=2.23.0", 24 | "openai>=1.29.0", 25 | "opencv-python>=4.11.0.86", 26 | "opencv-python-headless", 27 | "pandas>=2.1.4", 28 | "scipy>=1.11.4", 29 | "sentencepiece>=0.1.99", 30 | "transformers>=4.48.1", 31 | "datasets>=2.15.0", 32 | "tokenizers", 33 | "omegaconf", 34 | "iopath", 35 | "fairscale", 36 | # for clipscore 37 | "scikit-learn", 38 | "pycocoevalcap", 39 | "image-reward", 40 | "hpsv2", 41 | "fire==0.4.0", 42 | "tiktoken>=0.7.0", 43 | ] 44 | 45 | [tool.setuptools] 46 | include-package-data = true 47 | packages = ["t2v_metrics", "t2v_metrics.models"] 48 | 49 | [tool.setuptools.package-data] 50 | 't2v_metrics' = ['**/*.json', '**/*.yaml', '**/*.py'] 51 | 52 | [project.urls] 53 | Home = "https://linzhiqiu.github.io/papers/vqascore" 54 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | datasets 3 | scipy 4 | sentencepiece 5 | gdown 6 | tqdm 7 | ftfy 8 | regex 9 | git+https://github.com/openai/CLIP.git 10 | open-clip-torch 11 | opencv-python 12 | opencv-python-headless 13 | openai 14 | einops 15 | pandas 16 | # salesforce-lavis -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | from .constants import HF_CACHE_DIR 7 | from .vqascore import VQAScore, list_all_vqascore_models 8 | from .clipscore import CLIPScore, list_all_clipscore_models 9 | from .itmscore import ITMScore, list_all_itmscore_models 10 | 11 | def list_all_models(): 12 | return list_all_vqascore_models() + list_all_clipscore_models() + list_all_itmscore_models() 13 | 14 | def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=HF_CACHE_DIR, **kwargs): 15 | if model in list_all_vqascore_models(): 16 | return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs) 17 | elif model in list_all_clipscore_models(): 18 | return CLIPScore(model, device=device, cache_dir=cache_dir, **kwargs) 19 | elif model in list_all_itmscore_models(): 20 | return ITMScore(model, device=device, cache_dir=cache_dir, **kwargs) 21 | else: 22 | raise NotImplementedError() -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/clipscore.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .score import Score 4 | 5 | from .constants import HF_CACHE_DIR 6 | 7 | from .models.clipscore_models import list_all_clipscore_models, 
get_clipscore_model 8 | 9 | class CLIPScore(Score): 10 | def prepare_scoremodel(self, 11 | model='openai:ViT-L-14', 12 | device='cuda', 13 | cache_dir=HF_CACHE_DIR): 14 | return get_clipscore_model( 15 | model, 16 | device=device, 17 | cache_dir=cache_dir 18 | ) 19 | 20 | def list_all_models(self) -> List[str]: 21 | return list_all_clipscore_models() -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/constants.py: -------------------------------------------------------------------------------- 1 | HF_CACHE_DIR = "./hf_cache/" # TODO: change this to your own cache dir 2 | 3 | # For CLIP-FlanT5 and LLaVA-1.5 (copied from llava) 4 | CONTEXT_LEN = 2048 5 | SYSTEM_MSG = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions." 6 | IGNORE_INDEX = -100 7 | IMAGE_TOKEN_INDEX = -200 8 | DEFAULT_IMAGE_TOKEN = "<image>" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/itmscore.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .score import Score 4 | 5 | from .constants import HF_CACHE_DIR 6 | 7 | from .models.itmscore_models import list_all_itmscore_models, get_itmscore_model 8 | 9 | class ITMScore(Score): 10 | def prepare_scoremodel(self, 11 | model='blip2-itm', 12 | device='cuda', 13 | cache_dir=HF_CACHE_DIR): 14 | return get_itmscore_model( 15 | model, 16 | device=device, 17 | cache_dir=cache_dir 18 | ) 19 | 20 | def list_all_models(self) -> List[str]: 21 | return list_all_itmscore_models() -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/t2v_metrics/models/__init__.py -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/clipscore_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip_model import CLIP_MODELS, CLIPScoreModel 2 | from .blip2_itc_model import BLIP2_ITC_MODELS, BLIP2ITCScoreModel 3 | from .hpsv2_model import HPSV2_MODELS, HPSV2ScoreModel 4 | from .pickscore_model import PICKSCORE_MODELS, PickScoreModel 5 | from ...constants import HF_CACHE_DIR 6 | 7 | ALL_CLIP_MODELS = [ 8 | CLIP_MODELS, 9 | BLIP2_ITC_MODELS, 10 | HPSV2_MODELS, 11 | PICKSCORE_MODELS, 12 | ] 13 | 14 | def list_all_clipscore_models(): 15 | return [model for models in ALL_CLIP_MODELS for model in models] 16 | 17 | def get_clipscore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR): 18 | assert model_name in list_all_clipscore_models() 19 | if model_name in CLIP_MODELS: 20 | return CLIPScoreModel(model_name, device=device, cache_dir=cache_dir) 21 | elif model_name in BLIP2_ITC_MODELS: 22 | return BLIP2ITCScoreModel(model_name, device=device, cache_dir=cache_dir) 23 | elif model_name in HPSV2_MODELS: 24 | return HPSV2ScoreModel(model_name, device=device, cache_dir=cache_dir) 25 | elif model_name in PICKSCORE_MODELS: 26 | return PickScoreModel(model_name, device=device, cache_dir=cache_dir) 27 | else: 28 | raise NotImplementedError()
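As a quick illustration of how these factories are consumed (a sketch, not a file from the repository): it assumes t2v_metrics is installed, that the snippet runs from the third-party/t2v_metrics directory so the bundled example image resolves, and that the caption is made up:

import t2v_metrics

# Any name returned by list_all_vqascore_models(), list_all_clipscore_models(),
# or list_all_itmscore_models() is accepted; get_score_model dispatches on it.
score_fn = t2v_metrics.get_score_model(model="clip-flant5-xxl", device="cuda")

# The score object is called directly with image paths (or PIL images) and texts,
# exactly as vqa_server.py does above.
scores = score_fn(
    images=["images/example.png"],                      # bundled example image
    texts=["a made-up caption describing the image"],   # placeholder prompt
)
print(scores)

Swapping model for a CLIPScore name such as "openai:ViT-L-14" or an ITMScore name such as "image-reward-v1" routes construction through get_clipscore_model or get_itmscore_model above instead.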
-------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/clipscore_models/clip_model.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import List 3 | import torch 4 | import open_clip 5 | 6 | from ..model import ScoreModel 7 | from ...constants import HF_CACHE_DIR 8 | 9 | CLIP_MODELS = [f"{pretrained}:{arch}" for arch, pretrained in open_clip.list_pretrained()] 10 | 11 | class CLIPScoreModel(ScoreModel): 12 | "A wrapper for OpenCLIP models (including openAI's CLIP, OpenCLIP, DatacompCLIP)" 13 | def __init__(self, 14 | model_name='openai:ViT-L-14', 15 | device='cuda', 16 | cache_dir=HF_CACHE_DIR): 17 | assert model_name in CLIP_MODELS 18 | super().__init__(model_name=model_name, 19 | device=device, 20 | cache_dir=cache_dir) 21 | 22 | def load_model(self): 23 | """Load the model, tokenizer, image transform 24 | """ 25 | self.pretrained, self.arch = self.model_name.split(':') 26 | self.model, _, self.preprocess = open_clip.create_model_and_transforms( 27 | self.arch, 28 | pretrained=self.pretrained, 29 | device=self.device, 30 | cache_dir=self.cache_dir 31 | ) 32 | self.tokenizer = open_clip.get_tokenizer(self.arch) 33 | self.model.eval() 34 | 35 | def load_images(self, 36 | image: List[str]) -> torch.Tensor: 37 | """Load the image(s), and return a tensor (after preprocessing) put on self.device 38 | """ 39 | image = [self.image_loader(x) for x in image] 40 | image = [self.preprocess(x) for x in image] 41 | image = torch.stack(image, dim=0).to(self.device) 42 | return image 43 | 44 | @torch.no_grad() 45 | def forward(self, 46 | images: List[str], 47 | texts: List[str]) -> torch.Tensor: 48 | """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor) 49 | """ 50 | assert len(images) == len(texts) 51 | image = self.load_images(images) 52 | text = self.tokenizer(texts).to(self.device) 53 | image_features = self.model.encode_image(image) 54 | image_features /= image_features.norm(dim=-1, keepdim=True) 55 | text_features = self.model.encode_text(text) 56 | text_features /= text_features.norm(dim=-1, keepdim=True) 57 | 58 | # return cosine similarity as scores 59 | return (image_features * text_features).sum(dim=-1) -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import torch 3 | 4 | from ..model import ScoreModel 5 | from ...constants import HF_CACHE_DIR 6 | 7 | HPSV2_MODELS = ['hpsv2'] 8 | 9 | class HPSV2ScoreModel(ScoreModel): 10 | "A wrapper for HPSv2 models " 11 | def __init__(self, 12 | model_name='openai:ViT-L-14', 13 | device='cuda', 14 | cache_dir=HF_CACHE_DIR): 15 | assert model_name in HPSV2_MODELS 16 | super().__init__(model_name=model_name, 17 | device=device, 18 | cache_dir=cache_dir) 19 | 20 | def load_model(self): 21 | """Load the model, tokenizer, image transform 22 | """ 23 | import hpsv2 24 | self.hpsv2 = hpsv2 25 | 26 | def load_images(self, 27 | image: List[str]): 28 | """Load the image(s), and return a tensor (after preprocessing) put on self.device 29 | """ 30 | images = [self.image_loader(x) for x in image] 31 | return images 32 | 33 | @torch.no_grad() 34 | def forward(self, 35 | images: List[str], 36 | texts: List[str]) -> torch.Tensor: 37 | """Forward pass of the model 
to return n scores for n (image, text) pairs (in PyTorch Tensor) 38 | """ 39 | assert len(images) == len(texts) 40 | images = self.load_images(images) 41 | scores = torch.zeros(len(images), dtype=torch.float16).to(self.device) 42 | for i in range(len(images)): 43 | caption = texts[i] 44 | image = images[i] 45 | scores[i] = float(self.hpsv2.score(image, caption)[0]) 46 | 47 | # return cosine similarity as scores 48 | return scores -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/clipscore_models/pickscore_model.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import List 3 | import torch 4 | from transformers import AutoProcessor, AutoModel 5 | from PIL import Image 6 | 7 | from ..model import ScoreModel 8 | from ...constants import HF_CACHE_DIR 9 | 10 | PICKSCORE_MODELS = ['pickscore-v1'] 11 | 12 | class PickScoreModel(ScoreModel): 13 | "A wrapper for PickScore models" 14 | def __init__(self, 15 | model_name='pickscore-v1', 16 | device='cuda', 17 | cache_dir=HF_CACHE_DIR): 18 | assert model_name in PICKSCORE_MODELS 19 | super().__init__(model_name=model_name, 20 | device=device, 21 | cache_dir=cache_dir) 22 | 23 | def load_model(self): 24 | """Load the model, tokenizer, image transform 25 | """ 26 | assert self.model_name == 'pickscore-v1' 27 | processor_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K" 28 | model_pretrained_name_or_path = "yuvalkirstain/PickScore_v1" 29 | 30 | self.processor = AutoProcessor.from_pretrained(processor_name_or_path) 31 | self.model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(self.device) 32 | 33 | def load_images(self, 34 | image: List[str]) -> torch.Tensor: 35 | """Load the image(s), and return a tensor (no preprocessing!!) 
put on self.device 36 | """ 37 | image = [self.image_loader(x) for x in image] 38 | image = self.processor(images=image, padding=True, truncation=True, max_length=77, return_tensors="pt").to(self.device) 39 | # image = torch.stack(image, dim=0).to(self.device) 40 | return image 41 | 42 | @torch.no_grad() 43 | def forward(self, 44 | images: List[str], 45 | texts: List[str]) -> torch.Tensor: 46 | """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor) 47 | """ 48 | assert len(images) == len(texts) 49 | image = self.load_images(images) 50 | text_inputs = self.processor( 51 | text=texts, 52 | padding=True, 53 | truncation=True, 54 | max_length=77, 55 | return_tensors="pt", 56 | ).to(self.device) 57 | 58 | # embed 59 | image_embs = self.model.get_image_features(**image) 60 | image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True) 61 | 62 | text_embs = self.model.get_text_features(**text_inputs) 63 | text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True) 64 | 65 | # score 66 | scores = (image_embs * text_embs).sum(dim=-1) 67 | return scores 68 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/itmscore_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .blip2_itm_model import BLIP2_ITM_MODELS, BLIP2ITMScoreModel 2 | from .image_reward_model import IMAGE_REWARD_MODELS, ImageRewardScoreModel 3 | from ...constants import HF_CACHE_DIR 4 | 5 | ALL_ITM_MODELS = [ 6 | BLIP2_ITM_MODELS, 7 | IMAGE_REWARD_MODELS, 8 | ] 9 | 10 | def list_all_itmscore_models(): 11 | return [model for models in ALL_ITM_MODELS for model in models] 12 | 13 | def get_itmscore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR): 14 | assert model_name in list_all_itmscore_models() 15 | if model_name in BLIP2_ITM_MODELS: 16 | return BLIP2ITMScoreModel(model_name, device=device, cache_dir=cache_dir) 17 | elif model_name in IMAGE_REWARD_MODELS: 18 | return ImageRewardScoreModel(model_name, device=device, cache_dir=cache_dir) 19 | else: 20 | raise NotImplementedError() -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/itmscore_models/image_reward_model.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import torch 3 | import os 4 | from torchvision import transforms 5 | 6 | import ImageReward as reward 7 | from ..model import ScoreModel 8 | from ...constants import HF_CACHE_DIR 9 | 10 | IMAGE_REWARD_MODELS = { 11 | 'image-reward-v1': {'variant': "ImageReward-v1.0"}, 12 | } 13 | 14 | class ImageRewardScoreModel(ScoreModel): 15 | "A wrapper for ImageReward ITMScore (finetuned on human preference) models" 16 | def __init__(self, 17 | model_name='image-reward-v1', 18 | device='cuda', 19 | cache_dir=HF_CACHE_DIR): 20 | assert model_name in IMAGE_REWARD_MODELS, f"Model name must be one of {IMAGE_REWARD_MODELS.keys()}" 21 | os.environ['TORCH_HOME'] = cache_dir 22 | import timm.models.hub as timm_hub 23 | super().__init__(model_name=model_name, 24 | device=device, 25 | cache_dir=cache_dir) 26 | 27 | def load_model(self): 28 | """Load the model, tokenizer, image transform 29 | """ 30 | self.variant = IMAGE_REWARD_MODELS[self.model_name]['variant'] 31 | self.model = reward.load(self.variant).to(self.device).eval() 32 | 33 | def load_images(self, 34 | image: List[str]) -> torch.Tensor: 35 | """Load the 
image(s), and return a tensor (after preprocessing) put on self.device 36 | """ 37 | image = [self.image_loader(x) for x in image] 38 | image = [self.model.preprocess(image) for image in image] 39 | assert all(x.shape == image[0].shape for x in image) 40 | image = torch.stack(image, dim=0).to(self.device) 41 | return image 42 | 43 | @torch.no_grad() 44 | def forward(self, 45 | images: List[str], 46 | texts: List[str]) -> torch.Tensor: 47 | """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor) 48 | """ 49 | assert len(images) == len(texts), "Number of images and texts must match" 50 | rewards = torch.zeros(len(texts), dtype=torch.float32).to(self.device) 51 | images = self.load_images(images) 52 | for index in range(len(texts)): 53 | text_input = self.model.blip.tokenizer( 54 | texts[index], padding='max_length', 55 | truncation=True, max_length=35, return_tensors="pt").to(self.device) 56 | image_embeds = self.model.blip.visual_encoder(images[index].unsqueeze(0)) 57 | image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(self.device) 58 | text_output = self.model.blip.text_encoder( 59 | text_input.input_ids, 60 | attention_mask = text_input.attention_mask, 61 | encoder_hidden_states = image_embeds, 62 | encoder_attention_mask = image_atts, 63 | return_dict = True, 64 | ) 65 | 66 | txt_features = text_output.last_hidden_state[:,0,:].float() # (feature_dim) 67 | reward_score = self.model.mlp(txt_features) 68 | reward_score = (reward_score - self.model.mean) / self.model.std 69 | 70 | rewards[index] = reward_score 71 | 72 | return rewards -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | import os 4 | import torch 5 | import numpy as np 6 | from PIL import Image 7 | 8 | from ..constants import HF_CACHE_DIR 9 | 10 | 11 | 12 | def image_loader(image_path): 13 | # >>> ADDED ######################################################### 14 | if type(image_path) == Image.Image: 15 | return image_path.convert("RGB") 16 | # <<< ADDED ######################################################### 17 | elif image_path.split('.')[-1] == 'npy': 18 | return Image.fromarray(np.load(image_path)[:, :, [2, 1, 0]], 'RGB') 19 | else: 20 | return Image.open(image_path).convert("RGB") 21 | 22 | class ScoreModel(ABC): 23 | def __init__(self, 24 | model_name='clip-flant5-xxl', 25 | device='cuda', 26 | cache_dir=HF_CACHE_DIR): 27 | self.model_name = model_name 28 | self.device = device 29 | self.cache_dir = cache_dir 30 | if not os.path.exists(self.cache_dir): 31 | os.makedirs(self.cache_dir) 32 | self.image_loader = image_loader 33 | self.load_model() 34 | 35 | @abstractmethod 36 | def load_model(self): 37 | """Load the model, tokenizer, and etc. 
38 | """ 39 | pass 40 | 41 | @abstractmethod 42 | def load_images(self, 43 | image: List[str]) -> torch.Tensor: 44 | """Load the image(s), and return a tensor (after preprocessing) put on self.device 45 | """ 46 | pass 47 | 48 | @abstractmethod 49 | def forward(self, 50 | images: List[str], 51 | texts: List[str]) -> torch.Tensor: 52 | """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor) 53 | """ 54 | pass -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip_t5_model import CLIP_T5_MODELS, CLIPT5Model 2 | from .llava_model import LLAVA_MODELS, LLaVAModel 3 | from .llava16_model import LLAVA16_MODELS, LLaVA16Model 4 | from .instructblip_model import InstructBLIP_MODELS, InstructBLIPModel 5 | from .gpt4v_model import GPT4V_MODELS, GPT4VModel 6 | from ...constants import HF_CACHE_DIR 7 | 8 | ALL_VQA_MODELS = [ 9 | CLIP_T5_MODELS, 10 | LLAVA_MODELS, 11 | LLAVA16_MODELS, 12 | InstructBLIP_MODELS, 13 | GPT4V_MODELS, 14 | ] 15 | 16 | def list_all_vqascore_models(): 17 | return [model for models in ALL_VQA_MODELS for model in models] 18 | 19 | def get_vqascore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR, **kwargs): 20 | assert model_name in list_all_vqascore_models() 21 | if model_name in CLIP_T5_MODELS: 22 | return CLIPT5Model(model_name, device=device, cache_dir=cache_dir, **kwargs) 23 | elif model_name in LLAVA_MODELS: 24 | return LLaVAModel(model_name, device=device, cache_dir=cache_dir, **kwargs) 25 | elif model_name in LLAVA16_MODELS: 26 | return LLaVA16Model(model_name, device=device, cache_dir=cache_dir, **kwargs) 27 | elif model_name in InstructBLIP_MODELS: 28 | return InstructBLIPModel(model_name, device=device, cache_dir=cache_dir, **kwargs) 29 | elif model_name in GPT4V_MODELS: 30 | return GPT4VModel(model_name, device=device, cache_dir=cache_dir, **kwargs) 31 | else: 32 | raise NotImplementedError() -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.clip_t5 import CLIPT5ForConditionalGeneration, CLIPT5Config, ModelArguments -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | 
return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | else: 20 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 21 | 22 | def load_model(self): 23 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 24 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 25 | self.vision_tower.requires_grad_(False) 26 | 27 | self.is_loaded = True 28 | 29 | def feature_select(self, image_forward_outs): 30 | image_features = image_forward_outs.hidden_states[self.select_layer] 31 | if self.select_feature == 'patch': 32 | image_features = image_features[:, 1:] 33 | elif self.select_feature == 'cls_patch': 34 | image_features = image_features 35 | else: 36 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 37 | return image_features 38 | 39 | @torch.no_grad() 40 | def forward(self, images): 41 | if type(images) is list: 42 | image_features = [] 43 | for image in images: 44 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 45 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 46 | image_features.append(image_feature) 47 | else: 48 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 49 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 50 | 51 | return image_features 52 | 53 | @property 54 | def dummy_feature(self): 55 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 56 | 57 | @property 58 | def dtype(self): 59 | return self.vision_tower.dtype 60 | 61 | @property 62 | def device(self): 63 | return self.vision_tower.device 64 | 65 | @property 66 | def config(self): 67 | if self.is_loaded: 68 | return self.vision_tower.config 69 | else: 70 | return self.cfg_only 71 | 72 | @property 73 | def hidden_size(self): 74 | return self.config.hidden_size 75 | 76 | @property 77 | def num_patches(self): 78 | return (self.config.image_size // self.config.patch_size) ** 2 79 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 
17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from .common.registry import registry 14 | 15 | from .models import * 16 | from .processors import * 17 | 18 | 19 | root_dir = os.path.dirname(os.path.abspath(__file__)) 20 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 21 | 22 | registry.register_path("library_root", root_dir) 23 | repo_root = os.path.join(root_dir, "..") 24 | registry.register_path("repo_root", repo_root) 25 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 26 | registry.register_path("cache_root", cache_root) 27 | 28 | registry.register("MAX_INT", sys.maxsize) 29 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 30 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- 
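For reference, the mm_projector_type strings handled by build_vision_projector in multimodal_projector/builder.py above map directly to small torch modules via the r'^mlp(\d+)x_gelu$' pattern. The short sketch below illustrates one such call; it is not a file from this repository: the SimpleNamespace stands in for the real model config, the 1024/4096 feature sizes are assumed values, and it presumes the inner t2v_metrics package and its dependencies are importable.

from types import SimpleNamespace
import torch
from t2v_metrics.models.vqascore_models.clip_t5.model.multimodal_projector.builder import build_vision_projector

# 'mlp2x_gelu' matches the mlp(N)x_gelu pattern, so a 2-layer GELU MLP is built.
cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=1024, hidden_size=4096)
projector = build_vision_projector(cfg)        # Linear(1024 -> 4096) -> GELU -> Linear(4096 -> 4096)
feats = projector(torch.randn(2, 576, 1024))   # vision-tower patch features -> (2, 576, 4096)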
/third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/aokvqa/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | aok_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json 17 | storage: 18 | - aokvqa/annotations/aokvqa_v1p0_train.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json 22 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json 23 | storage: 24 | - aokvqa/annotations/aokvqa_v1p0_val.json 25 | - aokvqa/annotations/specialized_vocab_train_lavis.json 26 | # - aokvqa/annotations/large_vocab_train_lavis.json 27 | test: 28 | url: 29 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json 30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json 31 | storage: 32 | - aokvqa/annotations/aokvqa_v1p0_test.json 33 | - aokvqa/annotations/specialized_vocab_train_lavis.json 34 | images: 35 | storage: coco/images/ 36 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/avsd/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | avsd_dialogue: # name of the dataset builder 8 | dataset_card: dataset_card/avsd_dialogue.md 9 | data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json 16 | storage: avsd/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json 19 | storage: avsd/annotations/val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json 22 | storage: avsd/annotations/test.json 23 | features: 24 | storage: avsd/features/ 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_caption: # name of the dataset builder 8 | dataset_card: dataset_card/coco_caption.md 9 | # data_dir: ${env.data_dir}/datasets 10 | data_type: images # [images|videos|features] 11 | 12 | build_info: 13 | # Be careful not to append minus sign (-) before split to avoid itemizing 14 | annotations: 15 | train: 16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 17 | md5: aa31ac474cf6250ebb81d18348a07ed8 18 | storage: coco/annotations/coco_karpathy_train.json 19 | val: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 21 | md5: b273847456ef5580e33713b1f7de52a0 22 | storage: coco/annotations/coco_karpathy_val.json 23 | test: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 26 | storage: coco/annotations/coco_karpathy_test.json 27 | images: 28 | storage: coco/images/ 29 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json 18 | storage: 19 | - coco/annotations/vqa_train.json 20 | - coco/annotations/vqa_val.json 21 | val: 22 | url: 23 | # TODO make this order insensitive 24 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json 25 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 27 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json 28 | storage: 29 | - coco/annotations/vqa_val_eval.json 30 | - coco/annotations/answer_list.json 31 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 32 | - coco/annotations/v2_mscoco_val2014_annotations.json 33 | test: 34 | url: 35 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json 36 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 37 | storage: 38 | - coco/annotations/vqa_test.json 39 | - coco/annotations/answer_list.json 40 | images: 41 | storage: coco/images/ 42 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/eval_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, 
salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | # TODO make this order insensitive 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json 18 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 19 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 20 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json 21 | storage: 22 | - coco/annotations/vqa_val_eval.json 23 | - coco/annotations/answer_list.json 24 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 25 | - coco/annotations/v2_mscoco_val2014_annotations.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/didemo/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | didemo_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json 16 | storage: didemo/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json 19 | storage: didemo/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json 22 | storage: didemo/annotations/retrieval_test.json 23 | videos: 24 | storage: didemo/videos 25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos 26 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/gqa/balanced_testdev.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json 22 | storage: 23 | - gqa/annotations/testdev_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/gqa/balanced_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json 22 | storage: 23 | - gqa/annotations/val_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/gqa/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json 17 | - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json 18 | storage: 19 | - gqa/annotations/train_all_questions_0.json 20 | - gqa/annotations/val_all_questions.json 21 | val: 22 | url: 23 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json 24 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json 25 | storage: 26 | - aokvqa/annotations/aokvqa_v1p0_val.json 27 | - aokvqa/annotations/large_vocab_train_lavis.json 28 | test: 29 | url: 30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json 31 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json 32 | storage: 33 | - aokvqa/annotations/aokvqa_v1p0_test.json 34 | - aokvqa/annotations/large_vocab_train_lavis.json 35 | images: 36 | storage: gqa/images/ 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/msrvtt/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json 16 | storage: msrvtt/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json 19 | storage: msrvtt/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json 22 | storage: msrvtt/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json 25 | storage: msrvtt/annotations/qa_ans2label.json 26 | videos: 27 | storage: msrvtt/videos 28 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/msrvtt/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json 16 | storage: msrvtt/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json 19 | storage: msrvtt/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json 22 | storage: msrvtt/annotations/retrieval_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/msvd/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json 16 | storage: msvd/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json 19 | storage: msvd/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json 22 | storage: msvd/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json 25 | storage: msvd/annotations/qa_ans2label.json 26 | videos: 27 | storage: msvd/videos 28 | 29 | instance_id_key: question_id 30 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/okvqa/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | ok_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | # TODO make this order insensitive 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json 18 | # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json 19 | # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json 20 | storage: 21 | - okvqa/annotations/okvqa_train.json 22 | # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json 23 | # - okvqa/annotations/mscoco_train2014_annotations.json 24 | test: 25 | url: 26 | # TODO make this order insensitive 27 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json 28 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json 29 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json 30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json 31 | storage: 32 | - okvqa/annotations/vqa_val_eval.json 33 | - okvqa/annotations/answer_list.json 34 | - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json 35 | - okvqa/annotations/mscoco_val2014_annotations.json 36 | images: 37 | storage: coco/images/ 38 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/vatex/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json 16 | storage: vatex/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json 19 | storage: vatex/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json 22 | storage: vatex/annotations/cap_test.json 23 | videos: 24 | storage: /export/share/dongxuli/data/vatex 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/export/home/.cache/lavis" 11 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xxl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xxl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "lavis/output/BLIP2/Pretrain_stage2_flant5_xl_batch_80_no_prefix_iter_100000/20231015004/checkpoint_80000.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "lavis/output/BLIP2/Pretrain_stage2_flant5_xl_batch_80_prefix_iter_100000/20231015004/checkpoint_80000.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vicuna13b.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vicuna7b.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- 
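Note: the CLIP *.json files in this configs/models/clip directory describe only model geometry (embedding dimension, vision-tower and text-tower shapes); they carry no weights. As a rough, hypothetical sketch of how one of them could be inspected (this is not the loader used by the vendored lavis/open_clip code, and the relative path below is an assumption about your checkout), plain Python is enough:

    import json

    # Hypothetical path, relative to the repository root; adjust to your checkout.
    cfg_path = "third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN101.json"

    with open(cfg_path) as f:
        cfg = json.load(f)

    vision, text = cfg["vision_cfg"], cfg["text_cfg"]
    # ResNet towers list blocks-per-stage under "layers" and leave "patch_size" null;
    # ViT towers use an integer depth and set a concrete "patch_size".
    print("embed_dim:", cfg["embed_dim"])
    print("vision layers:", vision["layers"], "image_size:", vision["image_size"])
    print("text:", text["layers"], "layers,", text["heads"], "heads, width", text["width"])

The *-quickgelu variants differ from their base files only in setting "quick_gelu": true, which selects the QuickGELU activation used by the original OpenAI CLIP checkpoints.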
/third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-16.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | 
"patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | 
"layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | 
"timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: img2prompt_vqa 8 | model_type: base 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_generation_moodel: 47 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/img2prompt/T5_large_QG.pth" 48 | 49 | 50 | 51 | preprocess: 52 | vis_processor: 53 | eval: 54 | name: "blip_image_eval" 55 | image_size: 384 56 | text_processor: 57 | eval: 58 | name: "blip_caption" 59 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | 
"hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_answering_model: 47 | arch: pnp_unifiedqav2_fid 48 | 49 | pretrained: "allenai/unifiedqa-v2-t5-3b-1363200" 50 | 51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_3b_config.json" 52 | 53 | preprocess: 54 | vis_processor: 55 | eval: 56 | name: "blip_image_eval" 57 | image_size: 384 58 | text_processor: 59 | eval: 60 | name: "blip_caption" 61 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | question_answering_model: 46 | arch: pnp_unifiedqav2_fid 47 | 48 | pretrained: "allenai/unifiedqa-v2-t5-base-1363200" 49 | 50 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_base_config.json" 51 | 52 | preprocess: 53 | vis_processor: 54 | eval: 55 | name: "blip_image_eval" 56 | image_size: 384 57 | text_processor: 58 | eval: 59 | name: "blip_caption" 60 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_answering_model: 47 | arch: pnp_unifiedqav2_fid 48 | 49 | pretrained: "allenai/unifiedqa-v2-t5-large-1363200" 50 | 51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_large_config.json" 52 | 53 | preprocess: 54 | vis_processor: 55 | eval: 56 | name: "blip_image_eval" 57 | image_size: 384 58 | text_processor: 59 | eval: 60 | name: "blip_caption" 61 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 16384, 6 | "d_kv": 128, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 24, 21 | "num_heads": 32, 22 | "num_layers": 24, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "torch_dtype": "float32", 57 | "transformers_version": "4.21.3", 58 | "use_cache": true, 59 | "vocab_size": 32128 60 | } -------------------------------------------------------------------------------- 
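The pnp-vqa YAMLs above point their question-answering model at UnifiedQA-v2 T5 checkpoints ("allenai/unifiedqa-v2-t5-{base,large,3b}-1363200") and at matching unifiedqav2_*_config.json files, of which the 3b variant appears directly above (the base and large variants follow). A minimal sketch, for illustration only and not how LAVIS itself builds its question-answering model, that loads the 3b config with Hugging Face transformers and checks a few of its dimensions against the JSON:

from transformers import T5Config, T5ForConditionalGeneration

# Repo-relative path as shown in the file header above; pnp_vqa_3b.yaml refers
# to the same file via its package-relative t5_config_path.
cfg = T5Config.from_json_file(
    "third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json"
)
print(cfg.d_model, cfg.num_layers, cfg.num_heads)  # 1024 24 32, matching the JSON above

# The weights themselves come from the Hub checkpoint named in the YAML
# (a multi-gigabyte download; shown here only to connect the two settings).
model = T5ForConditionalGeneration.from_pretrained("allenai/unifiedqa-v2-t5-3b-1363200")
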
/third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 3072, 6 | "d_kv": 64, 7 | "d_model": 768, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 12, 21 | "num_heads": 12, 22 | "num_layers": 12, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 4096, 6 | "d_kv": 64, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 24, 21 | "num_heads": 16, 22 | "num_layers": 24, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } 
-------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/models/albef_models/albef_outputs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from dataclasses import dataclass 9 | from typing import Optional 10 | 11 | import torch 12 | from transformers.modeling_outputs import ( 13 | BaseModelOutputWithPoolingAndCrossAttentions, 14 | CausalLMOutputWithCrossAttentions, 15 | ModelOutput, 16 | ) 17 | 18 | 19 | @dataclass 20 | class AlbefSimilarity(ModelOutput): 21 | sim_i2t: torch.FloatTensor = None 22 | sim_t2i: torch.FloatTensor = None 23 | 24 | sim_i2t_m: Optional[torch.FloatTensor] = None 25 | sim_t2i_m: Optional[torch.FloatTensor] = None 26 | 27 | sim_i2t_targets: Optional[torch.FloatTensor] = None 28 | sim_t2i_targets: Optional[torch.FloatTensor] = None 29 | 30 | 31 | @dataclass 32 | class AlbefIntermediateOutput(ModelOutput): 33 | # uni-modal features 34 | image_embeds: torch.FloatTensor = None 35 | text_embeds: Optional[torch.FloatTensor] = None 36 | 37 | image_embeds_m: Optional[torch.FloatTensor] = None 38 | text_embeds_m: Optional[torch.FloatTensor] = None 39 | 40 | # intermediate outputs of multimodal encoder 41 | encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None 42 | encoder_output_m: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None 43 | encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None 44 | 45 | itm_logits: Optional[torch.FloatTensor] = None 46 | itm_labels: Optional[torch.LongTensor] = None 47 | 48 | # intermediate outputs of multimodal decoder 49 | decoder_output: Optional[CausalLMOutputWithCrossAttentions] = None 50 | decoder_labels: Optional[torch.LongTensor] = None 51 | 52 | 53 | @dataclass 54 | class AlbefOutput(ModelOutput): 55 | # some finetuned models (e.g. BlipVQA) do not compute similarity, thus optional. 56 | sims: Optional[AlbefSimilarity] = None 57 | 58 | intermediate_output: AlbefIntermediateOutput = None 59 | 60 | loss: Optional[torch.FloatTensor] = None 61 | 62 | loss_itc: Optional[torch.FloatTensor] = None 63 | 64 | loss_itm: Optional[torch.FloatTensor] = None 65 | 66 | loss_mlm: Optional[torch.FloatTensor] = None 67 | 68 | 69 | @dataclass 70 | class AlbefOutputWithLogits(AlbefOutput): 71 | logits: torch.FloatTensor = None 72 | logits_m: torch.FloatTensor = None 73 | 74 | 75 | @dataclass 76 | class AlbefOutputFeatures(ModelOutput): 77 | """ 78 | Data class of features from AlbefFeatureExtractor. 79 | 80 | Args: 81 | image_embeds: `torch.FloatTensor` of shape `(batch_size, num_patches+1, embed_dim)`, `optional` 82 | image_features: `torch.FloatTensor` of shape `(batch_size, num_patches+1, feature_dim)`, `optional` 83 | text_embeds: `torch.FloatTensor` of shape `(batch_size, sequence_length+1, embed_dim)`, `optional` 84 | text_features: `torch.FloatTensor` of shape `(batch_size, sequence_length+1, feature_dim)`, `optional` 85 | 86 | The first embedding or feature is for the [CLS] token. 87 | 88 | Features are obtained by projecting the corresponding embedding into a normalized low-dimensional space. 
89 | """ 90 | 91 | image_embeds: Optional[torch.FloatTensor] = None 92 | image_embeds_proj: Optional[torch.FloatTensor] = None 93 | 94 | text_embeds: Optional[torch.FloatTensor] = None 95 | text_embeds_proj: Optional[torch.FloatTensor] = None 96 | 97 | multimodal_embeds: Optional[torch.FloatTensor] = None 98 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import logging 9 | import os 10 | 11 | import torch 12 | from ...common.dist_utils import download_cached_file 13 | from ...common.utils import is_url 14 | from ...models.base_model import BaseModel 15 | from ...models.vit import interpolate_pos_embed 16 | from transformers import BertTokenizer 17 | 18 | 19 | class BlipBase(BaseModel): 20 | @classmethod 21 | def init_tokenizer(cls): 22 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 23 | tokenizer.add_special_tokens({"bos_token": "[DEC]"}) 24 | tokenizer.add_special_tokens({"additional_special_tokens": ["[ENC]"]}) 25 | tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0] 26 | return tokenizer 27 | 28 | def load_from_pretrained(self, url_or_filename): 29 | if is_url(url_or_filename): 30 | cached_file = download_cached_file( 31 | url_or_filename, check_hash=False, progress=True 32 | ) 33 | checkpoint = torch.load(cached_file, map_location="cpu") 34 | elif os.path.isfile(url_or_filename): 35 | checkpoint = torch.load(url_or_filename, map_location="cpu") 36 | else: 37 | raise RuntimeError("checkpoint url or path is invalid") 38 | 39 | state_dict = checkpoint["model"] 40 | 41 | state_dict["visual_encoder.pos_embed"] = interpolate_pos_embed( 42 | state_dict["visual_encoder.pos_embed"], self.visual_encoder 43 | ) 44 | if "visual_encoder_m.pos_embed" in self.state_dict().keys(): 45 | state_dict["visual_encoder_m.pos_embed"] = interpolate_pos_embed( 46 | state_dict["visual_encoder_m.pos_embed"], self.visual_encoder_m 47 | ) 48 | 49 | for key in self.state_dict().keys(): 50 | if key in state_dict.keys(): 51 | if state_dict[key].shape != self.state_dict()[key].shape: 52 | del state_dict[key] 53 | 54 | msg = self.load_state_dict(state_dict, strict=False) 55 | 56 | logging.info("Missing keys {}".format(msg.missing_keys)) 57 | logging.info("load checkpoint from %s" % url_or_filename) 58 | 59 | return msg 60 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from .base_processor import BaseProcessor 9 | 10 | from .blip_processors import ( 11 | BlipImageTrainProcessor, 12 | Blip2ImageTrainProcessor, 13 | BlipImageEvalProcessor, 14 | BlipCaptionProcessor, 15 | ) 16 | 17 | from ..common.registry import registry 18 | 19 | __all__ = [ 20 | "BaseProcessor", 21 | # BLIP 22 | "BlipImageTrainProcessor", 23 | "Blip2ImageTrainProcessor", 24 | "BlipImageEvalProcessor", 25 | "BlipCaptionProcessor", 26 | ] 27 | 28 | 29 | def load_processor(name, cfg=None): 30 | """ 31 | Example 32 | 33 | >>> processor = load_processor("alpro_video_train", cfg=None) 34 | """ 35 | processor = registry.get_processor_class(name).from_config(cfg) 36 | 37 | return processor 38 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM, LlavaConfig, ModelArguments 2 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig, ModelArguments -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or vision_tower.startswith("Lin-Chen"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava/model/multimodal_encoder/clip_encoder.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | else: 20 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 21 | 22 | def load_model(self): 23 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 24 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 25 | self.vision_tower.requires_grad_(False) 26 | 27 | self.is_loaded = True 28 | 29 | def feature_select(self, image_forward_outs): 30 | image_features = image_forward_outs.hidden_states[self.select_layer] 31 | if self.select_feature == 'patch': 32 | image_features = image_features[:, 1:] 33 | elif self.select_feature == 'cls_patch': 34 | image_features = image_features 35 | else: 36 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 37 | return image_features 38 | 39 | @torch.no_grad() 40 | def forward(self, images): 41 | if type(images) is list: 42 | image_features = [] 43 | for image in images: 44 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 45 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 46 | image_features.append(image_feature) 47 | else: 48 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 49 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 50 | 51 | return image_features 52 | 53 | @property 54 | def dummy_feature(self): 55 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 56 | 57 | @property 58 | def dtype(self): 59 | return self.vision_tower.dtype 60 | 61 | @property 62 | def device(self): 63 | return self.vision_tower.device 64 | 65 | @property 66 | def config(self): 67 | if self.is_loaded: 68 | return self.vision_tower.config 69 | else: 70 | return self.cfg_only 71 | 72 | @property 73 | def hidden_size(self): 74 | return self.config.hidden_size 75 | 76 | @property 77 | def num_patches(self): 78 | return (self.config.image_size // self.config.patch_size) ** 2 79 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | 
return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava_16/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM, LlavaConfig -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava_16/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava_16/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava_16/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | elif getattr(args, 'unfreeze_mm_vision_tower', False): 20 | self.load_model() 21 | else: 22 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 23 | 24 | def load_model(self): 25 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 26 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 27 | self.vision_tower.requires_grad_(False) 28 | 29 | self.is_loaded = True 30 | 31 | def 
feature_select(self, image_forward_outs): 32 | image_features = image_forward_outs.hidden_states[self.select_layer] 33 | if self.select_feature == 'patch': 34 | image_features = image_features[:, 1:] 35 | elif self.select_feature == 'cls_patch': 36 | image_features = image_features 37 | else: 38 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 39 | return image_features 40 | 41 | @torch.no_grad() 42 | def forward(self, images): 43 | if type(images) is list: 44 | image_features = [] 45 | for image in images: 46 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 47 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 48 | image_features.append(image_feature) 49 | else: 50 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 51 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 52 | 53 | return image_features 54 | 55 | @property 56 | def dummy_feature(self): 57 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 58 | 59 | @property 60 | def dtype(self): 61 | return self.vision_tower.dtype 62 | 63 | @property 64 | def device(self): 65 | return self.vision_tower.device 66 | 67 | @property 68 | def config(self): 69 | if self.is_loaded: 70 | return self.vision_tower.config 71 | else: 72 | return self.cfg_only 73 | 74 | @property 75 | def hidden_size(self): 76 | return self.config.hidden_size 77 | 78 | @property 79 | def num_patches_per_side(self): 80 | return self.config.image_size // self.config.patch_size 81 | 82 | @property 83 | def num_patches(self): 84 | return (self.config.image_size // self.config.patch_size) ** 2 -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava_16/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | 
-------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/vqa_model.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import List 3 | import torch 4 | 5 | from ..model import ScoreModel 6 | 7 | class VQAScoreModel(ScoreModel): 8 | 9 | @abstractmethod 10 | def forward(self, 11 | images: List[str], 12 | texts: List[str], 13 | question_template: str, 14 | answer_template: str) -> torch.Tensor: 15 | """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor) 16 | question_template: a string with optional {} to be replaced with the 'text' 17 | answer_template: a string with optional {} to be replaced with the 'text' 18 | """ 19 | pass -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/vqascore.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .score import Score 4 | 5 | from .constants import HF_CACHE_DIR 6 | 7 | from .models.vqascore_models import list_all_vqascore_models, get_vqascore_model 8 | 9 | class VQAScore(Score): 10 | def prepare_scoremodel(self, 11 | model='clip-flant5-xxl', 12 | device='cuda', 13 | cache_dir=HF_CACHE_DIR, 14 | **kwargs): 15 | return get_vqascore_model( 16 | model, 17 | device=device, 18 | cache_dir=cache_dir, 19 | **kwargs 20 | ) 21 | 22 | def list_all_models(self) -> List[str]: 23 | return list_all_vqascore_models() --------------------------------------------------------------------------------
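vqa_model.py fixes the contract that every VQA-based backend must implement (a forward pass over lists of image paths and texts, parameterized by question/answer templates), and vqascore.py exposes those backends through the VQAScore wrapper. A minimal usage sketch, assuming the t2v_metrics package is installed and that the Score base class (defined in score.py, not included in this excerpt) makes instances callable on images=/texts= lists; the image path and caption below are placeholders:

import t2v_metrics

# 'clip-flant5-xxl' is the default backend named in VQAScore.prepare_scoremodel;
# list_all_models() enumerates the other registered VQAScore backends.
scorer = t2v_metrics.VQAScore(model='clip-flant5-xxl', device='cuda')
print(scorer.list_all_models())

# One alignment score per (image, text) pair; images are file paths,
# matching the VQAScoreModel.forward signature above.
scores = scorer(images=['path/to/generated_image.png'],
                texts=['a photo of a corgi wearing a red hat'])
print(scores)
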