├── vlmeval ├── vlm │ ├── ovis │ │ ├── utils │ │ │ └── __init__.py │ │ └── __init__.py │ ├── ola │ │ ├── ola │ │ │ ├── datasets │ │ │ │ └── __init__.py │ │ │ ├── model │ │ │ │ ├── speech_encoder │ │ │ │ │ ├── beats │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── builder.py │ │ │ │ ├── __init__.py │ │ │ │ ├── speech_projector │ │ │ │ │ ├── builder.py │ │ │ │ │ └── speech_projector.py │ │ │ │ ├── multimodal_encoder │ │ │ │ │ └── builder.py │ │ │ │ ├── multimodal_resampler │ │ │ │ │ └── builder.py │ │ │ │ └── multimodal_projector │ │ │ │ │ └── pooler_projector.py │ │ │ ├── constants.py │ │ │ └── arguments.py │ │ └── __init__.py │ ├── ursa │ │ ├── __init__.py │ │ └── ursa_model │ │ │ ├── __init__.py │ │ │ └── processing_ursa.py │ ├── valley │ │ ├── __init__.py │ │ ├── valley_eagle │ │ │ ├── util │ │ │ │ ├── config.py │ │ │ │ └── vision_encoder_config.py │ │ │ ├── constants.py │ │ │ └── model │ │ │ │ ├── multimodal_encoder │ │ │ │ └── builder.py │ │ │ │ └── token_compressor │ │ │ │ ├── avgpool.py │ │ │ │ ├── builder.py │ │ │ │ └── roipool.py │ │ └── requirements_valley.txt │ ├── internvl │ │ └── __init__.py │ ├── qwen2_vl │ │ └── __init__.py │ ├── llava │ │ └── __init__.py │ ├── xcomposer │ │ └── __init__.py │ ├── video_llm │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── llama_vid │ │ │ │ └── processor │ │ │ │ │ └── clip-patch14-224 │ │ │ │ │ └── preprocessor_config.json │ │ │ └── videochat2_hd.json │ │ └── video_chatgpt.py │ ├── misc │ │ ├── minigptv2_eval.yaml │ │ ├── minigpt4_13b_eval.yaml │ │ ├── minigpt4_7b_eval.yaml │ │ ├── blip2_instruct_vicuna7b.yaml │ │ └── blip2_instruct_vicuna13b.yaml │ ├── visualglm.py │ ├── falcon_vlm.py │ ├── chameleon.py │ ├── mixsense.py │ ├── phi4_multimodal.py │ ├── instructblip.py │ ├── pandagpt.py │ ├── pixtral.py │ ├── clip.py │ ├── wemm.py │ ├── qh_360vl.py │ ├── slime.py │ └── __init__.py ├── dataset │ ├── Omnidocbench │ │ ├── __init__.py │ │ └── requirements.txt │ ├── utils │ │ ├── megabench │ │ │ ├── __init__.py │ │ │ ├── parsing │ │ │ │ ├── dummy_parse.py │ │ │ │ └── json_parse.py │ │ │ ├── aggregation │ │ │ │ ├── unsupported_agg.py │ │ │ │ ├── min_agg.py │ │ │ │ └── mean_agg.py │ │ │ ├── scoring │ │ │ │ ├── unsupported_scoring.py │ │ │ │ ├── exact_str_match_case_insensitive.py │ │ │ │ ├── set_precision.py │ │ │ │ ├── normalized_similarity_damerau_levenshtein.py │ │ │ │ ├── longest_common_list_prefix_ratio.py │ │ │ │ ├── gleu.py │ │ │ │ ├── sacrebleu_bleu.py │ │ │ │ ├── nli_entailment.py │ │ │ │ ├── number_rel_diff_ratio.py │ │ │ │ ├── chess_jaccard.py │ │ │ │ ├── near_str_match.py │ │ │ │ ├── multi_ref_phrase.py │ │ │ │ ├── dict_exact_match_agg_recall.py │ │ │ │ ├── dict_nbbox_iou_tuple_agg_jaccard.py │ │ │ │ ├── simple_str_match.py │ │ │ │ ├── dict_set_equality_agg_jaccard.py │ │ │ │ ├── dict_jaccard_agg_jaccard.py │ │ │ │ ├── positive_int_match.py │ │ │ │ ├── xml_nbbox_iou.py │ │ │ │ ├── dict_equality.py │ │ │ │ ├── xml_norm_point_in_bbox.py │ │ │ │ ├── xml_norm_point_distance.py │ │ │ │ ├── exact_str_match.py │ │ │ │ ├── mse.py │ │ │ │ ├── sequence_equality.py │ │ │ │ ├── coordinate_sequence_match.py │ │ │ │ ├── jaccard.py │ │ │ │ └── set_equality.py │ │ │ ├── requirements.txt │ │ │ ├── aggregation_type.py │ │ │ ├── response_parse_type.py │ │ │ └── utils.py │ │ ├── __init__.py │ │ ├── ccocr_evaluator │ │ │ └── __init__.py │ │ ├── crpe.py │ │ ├── hrbench.py │ │ ├── judge_util.py │ │ ├── qbench_video.py │ │ └── longvideobench.py │ ├── autolaporo_maneuver_classification.py │ ├── jigsaws_gesture_classification.py │ ├── heichole_helpers.py │ ├── 
cholec80_phase_recognition.py │ ├── jigsaws_skill_assessment.py │ ├── endoscapes_cvs_assessment.py │ ├── cholec80_tool_recognition.py │ ├── dresden_anatomy_presence.py │ ├── emma.py │ ├── image_caption.py │ ├── avos_action_recognition.py │ └── mmgenbench.py ├── smp │ ├── __init__.py │ └── log.py ├── utils │ ├── __init__.py │ ├── matching_util.py │ └── mp_util.py ├── __init__.py └── api │ ├── __init__.py │ ├── reka.py │ ├── glm_vision.py │ └── qwen_api.py ├── config ├── model │ ├── CLIP.yaml │ ├── OpenCLIP.yaml │ ├── SurgVLP.yaml │ ├── InternVL2.yaml │ ├── Phi-3.5-Vision.yaml │ ├── PaliGemma.yaml │ ├── Qwen2-VL.yaml │ └── llava_next_vicuna_7b.yaml ├── task │ ├── cholect45_triplet_recognition.yaml │ ├── endoscapes_cvs_assessment_fewshot.yaml │ ├── heichole_action_recognition_fewshot.yaml │ ├── endoscapes_cvs_assessment.yaml │ ├── heichole_action_recognition.yaml │ ├── avos_action_recognition.yaml │ ├── cholec80_tool_recognition.yaml │ ├── heichole_tool_recognition.yaml │ ├── heichole_tool_recognition_fewshot.yaml │ ├── cholec80_phase_recognition.yaml │ ├── heichole_phase_recognition.yaml │ ├── heichole_phase_recognition_fewshot.yaml │ ├── dresden_anatomy_presence.yaml │ └── multibypass140_phase_recognition.yaml └── config.yaml ├── docs ├── en │ ├── docutils.conf │ ├── _static │ │ ├── js │ │ │ └── custom.js │ │ ├── css │ │ │ └── readthedocs.css │ │ └── image │ │ │ └── logo_icon.svg │ ├── _templates │ │ ├── autosummary │ │ │ └── class.rst │ │ ├── callable.rst │ │ └── 404.html │ ├── .readthedocs.yaml │ ├── Makefile │ ├── index.rst │ ├── EvalByLMDeploy.md │ └── Contributors.md └── zh-CN │ ├── docutils.conf │ ├── _static │ ├── js │ │ └── custom.js │ ├── css │ │ └── readthedocs.css │ └── image │ │ └── logo_icon.svg │ ├── cp_origin_docs.sh │ ├── _templates │ ├── autosummary │ │ └── class.rst │ ├── callable.rst │ └── 404.html │ ├── .readthedocs.yaml │ ├── Makefile │ ├── EvalByLMDeploy.md │ └── index.rst ├── assets └── apple.jpg ├── scripts ├── run.sh ├── cover.sh ├── srun.sh ├── auto_run.py └── apires_scan.py ├── requirements └── docs.txt ├── .github ├── workflows │ ├── lint.yml │ └── pr-run-test.yml └── scripts │ └── assert_score.py ├── requirements.txt ├── .pre-commit-config.yaml └── eval.py /vlmeval/vlm/ovis/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlmeval/dataset/Omnidocbench/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/__init__.py: -------------------------------------------------------------------------------- 1 | from .ola_model import Ola 2 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_encoder/beats/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /config/model/CLIP.yaml: -------------------------------------------------------------------------------- 1 | name: CLIP 2 | contrastive: True 3 | -------------------------------------------------------------------------------- 
/vlmeval/vlm/ursa/__init__.py: -------------------------------------------------------------------------------- 1 | from .ursa_chat import UrsaChat -------------------------------------------------------------------------------- /config/model/OpenCLIP.yaml: -------------------------------------------------------------------------------- 1 | name: OpenCLIP 2 | contrastive: True 3 | -------------------------------------------------------------------------------- /config/model/SurgVLP.yaml: -------------------------------------------------------------------------------- 1 | name: SurgVLP 2 | contrastive: True 3 | -------------------------------------------------------------------------------- /config/model/InternVL2.yaml: -------------------------------------------------------------------------------- 1 | name: InternVL2-8B 2 | contrastive: False 3 | -------------------------------------------------------------------------------- /docs/en/docutils.conf: -------------------------------------------------------------------------------- 1 | [html writers] 2 | table_style: colwidths-auto 3 | -------------------------------------------------------------------------------- /config/model/Phi-3.5-Vision.yaml: -------------------------------------------------------------------------------- 1 | name: Phi-3.5-Vision 2 | contrastive: False -------------------------------------------------------------------------------- /docs/zh-CN/docutils.conf: -------------------------------------------------------------------------------- 1 | [html writers] 2 | table_style: colwidths-auto 3 | -------------------------------------------------------------------------------- /config/model/PaliGemma.yaml: -------------------------------------------------------------------------------- 1 | name: paligemma-3b-mix-448 2 | contrastive: False 3 | -------------------------------------------------------------------------------- /config/model/Qwen2-VL.yaml: -------------------------------------------------------------------------------- 1 | name: Qwen2-VL-7B-Instruct 2 | contrastive: False 3 | -------------------------------------------------------------------------------- /vlmeval/vlm/valley/__init__.py: -------------------------------------------------------------------------------- 1 | from .valley_eagle_chat import ValleyEagleChat 2 | -------------------------------------------------------------------------------- /assets/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anitarau/SurgBenchKit/HEAD/assets/apple.jpg -------------------------------------------------------------------------------- /config/model/llava_next_vicuna_7b.yaml: -------------------------------------------------------------------------------- 1 | name: llava_next_vicuna_7b 2 | contrastive: False 3 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.ola_qwen import OlaQwenForCausalLM, OlaConfigQwen -------------------------------------------------------------------------------- /vlmeval/vlm/internvl/__init__.py: -------------------------------------------------------------------------------- 1 | from .internvl_chat import InternVLChat 2 | 3 | __all__ = ['InternVLChat'] 4 | -------------------------------------------------------------------------------- /vlmeval/vlm/qwen2_vl/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .model import Qwen2VLChat 2 | from .prompt import Qwen2VLPromptMixin 3 | -------------------------------------------------------------------------------- /vlmeval/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .vlm import * 3 | from .misc import * 4 | from .log import * 5 | -------------------------------------------------------------------------------- /scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | export GPU=$(nvidia-smi --list-gpus | wc -l) 4 | torchrun --nproc-per-node=$GPU run.py ${@:1} -------------------------------------------------------------------------------- /vlmeval/vlm/ovis/__init__.py: -------------------------------------------------------------------------------- 1 | from .ovis import Ovis, Ovis1_6, Ovis1_6_Plus, Ovis2 2 | 3 | __all__ = ['Ovis', 'Ovis1_6', 'Ovis1_6_Plus', 'Ovis2'] -------------------------------------------------------------------------------- /scripts/cover.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 3 | cp $DIR/../config.py $DIR/../vlmeval/ 4 | cp $DIR/../misc/* $DIR/../vlmeval/vlm/misc/ -------------------------------------------------------------------------------- /scripts/srun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2} -------------------------------------------------------------------------------- /vlmeval/dataset/Omnidocbench/requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision 2 | Levenshtein 3 | BeautifulSoup4 4 | pylatexenc 5 | scipy 6 | evaluate 7 | apted 8 | lxml 9 | func_timeout 10 | accelerate>=0.26.0 11 | jmespath 12 | qwen_vl_utils 13 | nltk -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/__init__.py: -------------------------------------------------------------------------------- 1 | from .aggregation_type import AggregationType 2 | from .metric_type import MetricType 3 | from .response_parse_type import ResponseParseType 4 | 5 | __all__ = [AggregationType, MetricType, ResponseParseType] 6 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/parsing/dummy_parse.py: -------------------------------------------------------------------------------- 1 | class DummyParse: 2 | 3 | @staticmethod 4 | def parse(response: str, *args, **kwargs) -> dict: 5 | """return the raw string without doing anything""" 6 | return response.strip() 7 | -------------------------------------------------------------------------------- /vlmeval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .matching_util import can_infer, can_infer_option, can_infer_text 2 | from .mp_util import track_progress_rich 3 | 4 | 5 | __all__ = [ 6 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 7 | ] 8 | -------------------------------------------------------------------------------- 
/config/task/cholect45_triplet_recognition.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: cholect45_triplet_recognition 3 | data: Cholect45Triplet 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/CholecT45/ 7 | clip_eval_mode: 'sigmoid' # multi-label binary classification 8 | -------------------------------------------------------------------------------- /config/task/endoscapes_cvs_assessment_fewshot.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: endoscapes_cvs_assessment_fewshot 3 | data: EndoscapesCVSAssessment 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/endoscapes/ 7 | label_names: ['C1', 'C2', 'C3'] 8 | shots: five 9 | -------------------------------------------------------------------------------- /docs/en/_static/js/custom.js: -------------------------------------------------------------------------------- 1 | var collapsedSections = []; 2 | 3 | $(document).ready(function () { 4 | $('.model-summary').DataTable({ 5 | "stateSave": false, 6 | "lengthChange": false, 7 | "pageLength": 20, 8 | "order": [] 9 | }); 10 | }); 11 | -------------------------------------------------------------------------------- /config/task/heichole_action_recognition_fewshot.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: heichole_action_recognition_fewshot 3 | data: HeiCholeDataloader 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/heichole 7 | label_names: ['grasp', 'hold', 'cut', 'clip'] 8 | shots: one 9 | -------------------------------------------------------------------------------- /docs/zh-CN/_static/js/custom.js: -------------------------------------------------------------------------------- 1 | var collapsedSections = []; 2 | 3 | $(document).ready(function () { 4 | $('.model-summary').DataTable({ 5 | "stateSave": false, 6 | "lengthChange": false, 7 | "pageLength": 20, 8 | "order": [] 9 | }); 10 | }); 11 | -------------------------------------------------------------------------------- /vlmeval/vlm/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF 2 | from .llava_xtuner import LLaVA_XTuner 3 | 4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF'] 5 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/aggregation/unsupported_agg.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import Dict 3 | 4 | 5 | class UnsupportedAggregation: 6 | @staticmethod 7 | def aggregate(scores: Dict[str, Number], weights: Dict[str, Number]) -> Number: 8 | return -1 9 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/unsupported_scoring.py: -------------------------------------------------------------------------------- 1 | class UnsupportedScoring: 2 | """Unsupported scoring.""" 3 | 4 | @staticmethod 5 | def match(response: str, correct_answer: str) -> int: 6 | """Default response for unimplemented metrics.""" 7 | return -1 8 | -------------------------------------------------------------------------------- /docs/zh-CN/cp_origin_docs.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copy *.md files from docs/ if it doesn't have a Chinese translation 4 | 5 | for filename in $(find ../en/ -name '*.md' -printf "%P\n"); 6 | do 7 | mkdir -p $(dirname $filename) 8 | cp -n ../en/$filename ./$filename 9 | done 10 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | docutils==0.18.1 2 | modelindex 3 | myst-parser 4 | -e git+https://github.com/open-compass/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme 5 | sphinx==6.1.3 6 | sphinx-copybutton 7 | sphinx-design 8 | sphinx-notfound-page 9 | sphinx-tabs 10 | sphinxcontrib-jquery 11 | tabulate 12 | -------------------------------------------------------------------------------- /config/task/endoscapes_cvs_assessment.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: endoscapes_cvs_assessment 3 | data: EndoscapesCVSAssessment 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/endoscapes/ 7 | label_names: ['C1', 'C2', 'C3'] 8 | clip_eval_mode: 'sigmoid' # multi-label binary classification 9 | shots: zero 10 | -------------------------------------------------------------------------------- /vlmeval/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | except ImportError: 4 | pass 5 | 6 | from .smp import * 7 | from .api import * 8 | from .dataset import * 9 | from .utils import * 10 | from .vlm import * 11 | from .config import * 12 | from .tools import cli 13 | 14 | load_env() 15 | 16 | __version__ = '0.2rc1' 17 | -------------------------------------------------------------------------------- /config/task/heichole_action_recognition.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: heichole_action_recognition 3 | data: HeiCholeDataloader 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/heichole 7 | label_names: ['grasp', 'hold', 'cut', 'clip'] 8 | clip_eval_mode: 'sigmoid' # multi-label binary classification 9 | shots: zero 10 | -------------------------------------------------------------------------------- /vlmeval/vlm/xcomposer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sharecaptioner import ShareCaptioner 2 | from .xcomposer import XComposer 3 | from .xcomposer2 import XComposer2 4 | from .xcomposer2_4KHD import XComposer2_4KHD 5 | from .xcomposer2d5 import XComposer2d5 6 | 7 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD', 'XComposer2d5'] 8 | -------------------------------------------------------------------------------- /docs/en/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | .. 12 | autogenerated from _templates/autosummary/class.rst 13 | note it does not have :inherited-members: 14 | -------------------------------------------------------------------------------- /docs/zh-CN/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. 
currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | .. 12 | autogenerated from _templates/autosummary/class.rst 13 | note it does not have :inherited-members: 14 | -------------------------------------------------------------------------------- /config/task/avos_action_recognition.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: avos_action_recognition 3 | data: AVOSActionRecognition 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/AVOS 7 | label_names: ['cutting', 'tying', 'suturing', 'background'] 8 | clip_eval_mode: 'singlelabel' # single label multi-class classification 9 | shots: zero 10 | -------------------------------------------------------------------------------- /docs/en/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: "3.8" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/en/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | -------------------------------------------------------------------------------- /docs/en/_templates/callable.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | :special-members: __call__ 11 | 12 | .. 13 | autogenerated from _templates/callable.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /docs/zh-CN/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: "3.8" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/zh-CN/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | -------------------------------------------------------------------------------- /docs/zh-CN/_templates/callable.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | :special-members: __call__ 11 | 12 | .. 
13 | autogenerated from _templates/callable.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .judge_util import build_judge, DEBUG_MESSAGE 2 | from .multiple_choice import extract_answer_from_item, prefetch_answer 3 | from .vqa_eval import levenshtein_distance 4 | 5 | 6 | __all__ = [ 7 | 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 8 | 'levenshtein_distance', 'DEBUG_MESSAGE', 9 | ] 10 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/requirements.txt: -------------------------------------------------------------------------------- 1 | antlr4-python3-runtime==4.11.0 2 | filelock==3.16.1 3 | geopy==2.4.1 4 | jieba==0.42.1 5 | nltk==3.9.1 6 | numpy==1.26.4 7 | pronouncing==0.2.0 8 | rapidfuzz==3.9.5 9 | regex==2024.7.24 10 | requests==2.32.3 11 | requests_cache==1.2.1 12 | sacrebleu==2.4.3 13 | sympy==1.13.2 14 | tqdm==4.66.4 15 | Unidecode==1.3.8 16 | -------------------------------------------------------------------------------- /config/task/cholec80_tool_recognition.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: cholec80_tool_recognition 3 | data: Cholec80ToolRecognition 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/cholec80 7 | label_names: ['Grasper', 'Bipolar', 'Hook', 'Scissors', 'Clipper', 'Irrigator', 'SpecimenBag'] 8 | clip_eval_mode: 'sigmoid' # multi-label binary classification 9 | shots: zero 10 | -------------------------------------------------------------------------------- /vlmeval/vlm/video_llm/__init__.py: -------------------------------------------------------------------------------- 1 | from .video_llava import VideoLLaVA, VideoLLaVA_HF 2 | from .videochat2 import VideoChat2_HD 3 | from .chat_uni_vi import Chatunivi 4 | from .video_chatgpt import VideoChatGPT 5 | from .llama_vid import LLaMAVID 6 | from .pllava import PLLaVA 7 | 8 | __all__ = ['VideoLLaVA', 'VideoLLaVA_HF', 'Chatunivi', 'VideoChatGPT', 'LLaMAVID', 'VideoChat2_HD', 'PLLaVA'] 9 | -------------------------------------------------------------------------------- /config/task/heichole_tool_recognition.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: heichole_tool_recognition 3 | data: HeiCholeDataloader 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/heichole 7 | label_names: ['Grasper', 'Clipper', 'Coagulation instruments', 'Scissors', 'Suction-irrigation', 'Specimen bag', 'Stapler'] 8 | clip_eval_mode: 'sigmoid' # multi-label binary classification 9 | shots: zero 10 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_projector/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_projector import EncoderProjectorConcat 2 | 3 | 4 | def build_speech_projector(config): 5 | projector_type = getattr(config, 'speech_projector_type', 'linear') 6 | if projector_type == 'linear': 7 | return EncoderProjectorConcat(config) 8 | 9 | raise ValueError(f'Unknown projector type: {projector_type}') 10 | -------------------------------------------------------------------------------- /config/task/heichole_tool_recognition_fewshot.yaml: 
-------------------------------------------------------------------------------- 1 | # Task 2 | name: heichole_tool_recognition_fewshot 3 | data: HeiCholeDataloader 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/heichole 7 | label_names: ['Grasper', 'Clipper', 'Coagulation instruments', 'Scissors', 'Suction-irrigation', 'Specimen bag', 'Stapler'] 8 | clip_eval_mode: 'sigmoid' # multi-label binary classification 9 | shots: one 10 | -------------------------------------------------------------------------------- /config/task/cholec80_phase_recognition.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: cholec80_phase_recognition 3 | data: Cholec80PhaseRecognition 4 | data_config: 5 | data_dir: /path/to/cholec80 6 | label_names: ['Preparation', 'CalotTriangleDissection', 'ClippingCutting', 'GallbladderDissection', 'GallbladderPackaging', 'CleaningCoagulation', 'GallbladderRetraction'] 7 | clip_eval_mode: 'singlelabel' # single label multi-class classification 8 | shots: zero -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | SPEECH_TOKEN_INDEX = -200 9 | DEFAULT_SPEECH_TOKEN = "" 10 | IMAGE_TOKEN_INDEX= -300 11 | DEFAULT_IMAGE_TOKEN = "" 12 | DEFAULT_IMAGE_PATCH_TOKEN = "" 13 | DEFAULT_IM_START_TOKEN = "" 14 | DEFAULT_IM_END_TOKEN = "" -------------------------------------------------------------------------------- /config/task/heichole_phase_recognition.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: heichole_phase_recognition 3 | data: HeiCholeDataloader 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/heichole 7 | label_names: ['Preparation', 'CalotTriangleDissection', 'ClippingCutting', 'GallbladderDissection', 'GallbladderPackaging', 'CleaningCoagulation', 'GallbladderRetraction'] 8 | clip_eval_mode: 'singlelabel' # single label multi-class classification 9 | shots: zero 10 | -------------------------------------------------------------------------------- /config/task/heichole_phase_recognition_fewshot.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: heichole_phase_recognition_fewshot 3 | data: HeiCholeDataloader 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/heichole 7 | label_names: ['Preparation', 'CalotTriangleDissection', 'ClippingCutting', 'GallbladderDissection', 'GallbladderPackaging', 'CleaningCoagulation', 'GallbladderRetraction'] 8 | clip_eval_mode: 'singlelabel' # single label multi-class classification 9 | shots: one 10 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/ccocr_evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | from .kie_evaluator import KieEvaluator 2 | from .doc_parsing_evaluator import ParsingEvaluator 3 | from .ocr_evaluator import OcrEvaluator 4 | from .common import summary 5 | 6 | 7 | evaluator_map_info = { 8 | "kie": KieEvaluator("kie"), 9 | "doc_parsing": ParsingEvaluator("doc_parsing"), 10 | "multi_lan_ocr": OcrEvaluator("multi_lan_ocr"), 11 | "multi_scene_ocr": OcrEvaluator("multi_scene_ocr") 12 | } 13 | 
-------------------------------------------------------------------------------- /config/task/dresden_anatomy_presence.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: dresden_anatomy_presence 3 | data: DresdenAnatomyPresence 4 | data_config: 5 | transform: None 6 | data_dir: /path/to//DresdenSurgicalAnatomy 7 | label_names: ['abdominal wall', 'colon', 'inferior mesenteric artery', 'intestinal veins', 'liver', 'pancreas', 8 | 'small intestine', 'spleen', 'stomach', 'ureter', 'null', 'vesicular glands'] 9 | clip_eval_mode: 'sigmoid' # multi-label binary classification 10 | -------------------------------------------------------------------------------- /vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": 224, 3 | "do_center_crop": true, 4 | "do_normalize": true, 5 | "do_resize": true, 6 | "feature_extractor_type": "CLIPFeatureExtractor", 7 | "image_mean": [ 8 | 0.48145466, 9 | 0.4578275, 10 | 0.40821073 11 | ], 12 | "image_std": [ 13 | 0.26862954, 14 | 0.26130258, 15 | 0.27577711 16 | ], 17 | "resample": 3, 18 | "size": 224 19 | } 20 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - model: GeminiPro1-5 4 | - task: dresden_anatomy_presence 5 | - override hydra/hydra_logging: disabled 6 | - override hydra/job_logging: disabled 7 | 8 | # Set model and task 9 | workdir: /pasteur/u/arau/projects/surg_bench/check_for_pub/ 10 | exp_name: deleteme_for_pub 11 | eval_mode: infer_data 12 | 13 | override_outputs: False 14 | 15 | #prevent hydra outputs 16 | hydra: 17 | output_subdir: null 18 | run: 19 | dir: . 
-------------------------------------------------------------------------------- /vlmeval/dataset/utils/crpe.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from collections import defaultdict 4 | 5 | 6 | def is_correct(predict, answer): 7 | # predict is the ground-truth answer; answer is the prediction 8 | if len(answer) == 1: 9 | return answer[0] == predict[0] 10 | elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']: 11 | return answer[0] == predict[0] 12 | elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']: 13 | return predict[4:].lower() in answer.lower() 14 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_encoder import WhisperWrappedEncoder, DualWrappedEncoder 2 | 3 | 4 | def build_speech_encoder(config): 5 | speech_encoder_type = getattr(config, 'speech_encoder_type', None) 6 | if "whisper" in speech_encoder_type.lower(): 7 | return WhisperWrappedEncoder.load(config) 8 | elif "dual" in speech_encoder_type.lower(): 9 | return DualWrappedEncoder(config) 10 | 11 | raise ValueError(f'Unknown speech encoder: {speech_encoder_type}') 12 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/aggregation/min_agg.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import Dict 3 | 4 | 5 | class MinAggregation: 6 | """Take the minimum of all valid scores.""" 7 | 8 | @staticmethod 9 | def aggregate(scores: Dict[str, Number], weights: Dict[str, Number]) -> Number: 10 | """Exact match between targets and responses.""" 11 | filtered_scores = [s for s in scores.values() if s >= 0] 12 | if not filtered_scores: 13 | return -1 14 | return min(filtered_scores) 15 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .oryx_vit import SigLIPViTAnysizeWrapper 3 | 4 | def build_vision_tower(vision_tower_cfg, **kwargs): 5 | vision_tower = getattr(vision_tower_cfg, 'vision_tower', getattr(vision_tower_cfg, 'mm_vision_tower', None)) 6 | is_absolute_path_exists = os.path.exists(vision_tower) 7 | print(f"Building OryxViTWrapper from {vision_tower}...") 8 | # path = vision_tower.split(":")[1] 9 | return SigLIPViTAnysizeWrapper(vision_tower, path=vision_tower, args=vision_tower_cfg, **kwargs) -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/exact_str_match_case_insensitive.py: -------------------------------------------------------------------------------- 1 | from .exact_str_match import ExactStrMatch 2 | 3 | 4 | class ExactStrMatchCaseInsensitive: 5 | """Case-insensitive exact string matching.""" 6 | 7 | @staticmethod 8 | def match(response, correct_answer) -> int: 9 | """Case-insensitive exact match between targets and responses.""" 10 | if not isinstance(response, str) and isinstance(correct_answer, str): 11 | return 0 12 | return ExactStrMatch.match(response.lower(), correct_answer.lower()) 13 | -------------------------------------------------------------------------------- /docs/en/_templates/404.html: 
-------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block body %} 4 | 5 | Page Not Found 6 | 7 | The page you are looking for cannot be found. 8 | 9 | 10 | If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in 11 | the content table left, or go to the homepage. 12 |
13 | 17 | 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /docs/zh-CN/_templates/404.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block body %} 4 | 5 | Page Not Found 6 | 7 | The page you are looking for cannot be found. 8 | 9 | 10 | If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in 11 | the content table left, or go to the homepage. 12 | 13 | 17 | 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /config/task/multibypass140_phase_recognition.yaml: -------------------------------------------------------------------------------- 1 | # Task 2 | name: multibypass140_phase_recognition 3 | data: MultiBypass140PhaseRecognition 4 | data_config: 5 | transform: None 6 | data_dir: /path/to/MultiBypass140/ 7 | label_names: ['Preparation', 'Gastric pouch creation', 'Omentum division', 'Gastrojejunal anastomosis', 'Anastomosis test', 'Jejunal separation', 'Petersen space closure', 'Jejunojejunal anastomosis', 'Mesenteric defect closure', 'Cleaning & Coagulation', 'Disassembling', 'Other intervention'] 8 | clip_eval_mode: 'singlelabel' # single label multi-class classification 9 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/set_precision.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_set 2 | from .common.metrics import set_precision 3 | 4 | 5 | class SetPrecision: 6 | """Calculates the set precision for iterables.""" 7 | 8 | @classmethod 9 | def match(cls, responses, targets) -> float: 10 | """Exact match between targets and responses.""" 11 | if responses is None: 12 | return 0 13 | responses = cast_to_set(responses) 14 | targets = cast_to_set(targets) 15 | 16 | return set_precision(responses, targets) 17 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/normalized_similarity_damerau_levenshtein.py: -------------------------------------------------------------------------------- 1 | import rapidfuzz 2 | 3 | 4 | class NormalizedSimilarityDamerauLevenshtein: 5 | """Normalized Damerau-Levenshtein Similarity.""" 6 | 7 | @staticmethod 8 | def match(response, correct_answer) -> int: 9 | """Normalized indel similarity between targets and responses.""" 10 | if not isinstance(response, str) and isinstance(correct_answer, str): 11 | return 0 12 | return rapidfuzz.distance.DamerauLevenshtein.normalized_similarity( 13 | response, correct_answer 14 | ) 15 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.10 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.10.15 18 | - name: Install pre-commit hook 19 | run: | 20 | pip install pre-commit 21 | pre-commit install 22 | - name: Linting 23 | run: pre-commit run --all-files 24 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/longest_common_list_prefix_ratio.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import str_to_list 2 | from .common.metrics import longest_common_prefix 3 | 4 | 5 | class LongestCommonListPrefixRatio: 6 | """Determines how much of the first part of the list 7 | was predicted correctly.
8 | """ 9 | 10 | @classmethod 11 | def match(cls, responses, targets) -> int: 12 | """Exact match between targets and responses.""" 13 | responses = str_to_list(responses) 14 | targets = str_to_list(targets) 15 | return len(longest_common_prefix(responses, targets)) / len(targets) 16 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/parsing/json_parse.py: -------------------------------------------------------------------------------- 1 | from .common.parsers import parse_json 2 | from .common.utils import evaluate_as_string 3 | 4 | 5 | class JsonParse: 6 | """Load the response as a JSON object.""" 7 | 8 | @staticmethod 9 | def parse(response: str): 10 | """Parse the JSON object, including nested JSON strings.""" 11 | parsed_res = parse_json(response) 12 | # Drop the potentially duplicated string quotes 13 | if isinstance(parsed_res, dict): 14 | for key, val in parsed_res.items(): 15 | parsed_res[key] = evaluate_as_string(val) 16 | 17 | return parsed_res 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | decord; platform_machine != 'arm64' 2 | eva-decord; platform_machine == 'arm64' 3 | gradio 4 | huggingface_hub 5 | imageio 6 | matplotlib 7 | numpy==1.26.4 8 | omegaconf 9 | openai 10 | opencv-python>=4.4.0.46 11 | openpyxl 12 | pandas 13 | pillow 14 | portalocker 15 | protobuf 16 | python-dotenv 17 | requests 18 | rich 19 | sentencepiece 20 | setuptools 21 | sty 22 | tabulate 23 | tiktoken 24 | timeout-decorator 25 | torch 26 | torchvision 27 | tqdm 28 | transformers 29 | typing_extensions 30 | validators 31 | xlsxwriter 32 | ftfy 33 | regex 34 | pandas 35 | decord 36 | scikit-learn 37 | hydra-core 38 | google-generativeai 39 | flash-attn==2.6.3 40 | qwen-vl-utils 41 | open_clip_torch 42 | clip -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/gleu.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | import jieba 3 | from nltk.translate.gleu_score import sentence_gleu 4 | 5 | 6 | class GLEUChinese: 7 | """Compute GLEU score for Chinese text.""" 8 | 9 | @staticmethod 10 | def match(response, correct_answer) -> Number: 11 | """Compute the BLEU scores between two strings.""" 12 | if isinstance(response, str) and isinstance(correct_answer, str): 13 | reference_tokens = list(jieba.cut_for_search(response)) 14 | translation_tokens = list(jieba.cut_for_search(correct_answer)) 15 | else: 16 | return 0 17 | return sentence_gleu([reference_tokens], translation_tokens) 18 | -------------------------------------------------------------------------------- /docs/en/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/zh-CN/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /vlmeval/vlm/valley/valley_eagle/util/config.py: -------------------------------------------------------------------------------- 1 | IGNORE_INDEX = -100 2 | IMAGE_TOKEN_INDEX = -200 3 | GANDALF_TOKEN_INDEX = -300 4 | DEFAULT_PAD_TOKEN = "[PAD]" 5 | DEFAULT_EOS_TOKEN = "" 6 | DEFAULT_BOS_TOKEN = "" 7 | DEFAULT_UNK_TOKEN = "" 8 | DEFAULT_IMAGE_TOKEN = "" 9 | DEFAULT_IMAGE_PATCH_TOKEN = "" 10 | DEFAULT_IM_START_TOKEN = "" 11 | DEFAULT_IM_END_TOKEN = "" 12 | 13 | DEFAULT_VIDEO_TOKEN = "