├── .github ├── scripts │ └── assert_score.py └── workflows │ ├── lint.yml │ └── pr-run-test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets ├── LOGO.svg └── apple.jpg ├── docs ├── en │ ├── .readthedocs.yaml │ ├── ConfigSystem.md │ ├── Contributors.md │ ├── Development.md │ ├── EvalByLMDeploy.md │ ├── Makefile │ ├── Quickstart.md │ ├── _static │ │ ├── css │ │ │ └── readthedocs.css │ │ ├── image │ │ │ ├── logo.svg │ │ │ └── logo_icon.svg │ │ └── js │ │ │ └── custom.js │ ├── _templates │ │ ├── 404.html │ │ ├── autosummary │ │ │ └── class.rst │ │ └── callable.rst │ ├── conf.py │ ├── docutils.conf │ └── index.rst ├── ja │ └── README_ja.md └── zh-CN │ ├── .readthedocs.yaml │ ├── ConfigSystem.md │ ├── Development.md │ ├── EvalByLMDeploy.md │ ├── Makefile │ ├── Quickstart.md │ ├── README_zh-CN.md │ ├── _static │ ├── css │ │ └── readthedocs.css │ ├── image │ │ ├── logo.svg │ │ └── logo_icon.svg │ └── js │ │ └── custom.js │ ├── _templates │ ├── 404.html │ ├── autosummary │ │ └── class.rst │ └── callable.rst │ ├── conf.py │ ├── cp_origin_docs.sh │ ├── docutils.conf │ └── index.rst ├── requirements.txt ├── requirements └── docs.txt ├── run.py ├── scripts ├── AI2D_preproc.ipynb ├── apires_scan.py ├── auto_run.py ├── cover.sh ├── data_browser.py ├── mmb_eval_gradio.py ├── run.sh ├── srun.sh ├── summarize.py └── visualize.ipynb ├── setup.py └── vlmeval ├── __init__.py ├── api ├── __init__.py ├── bailingmm.py ├── base.py ├── bluelm_v_api.py ├── claude.py ├── cloudwalk.py ├── doubao_vl_api.py ├── gemini.py ├── glm_vision.py ├── gpt.py ├── hf_chat_model.py ├── hunyuan.py ├── jt_vl_chat.py ├── lmdeploy.py ├── mug_u.py ├── qwen_api.py ├── qwen_vl_api.py ├── reka.py ├── sensechat_vision.py ├── siliconflow.py ├── stepai.py ├── taichu.py └── taiyi.py ├── config.py ├── dataset ├── GUI │ ├── __init__.py │ ├── screenspot.py │ ├── screenspot_pro.py │ └── screenspot_v2.py ├── Omnidocbench │ ├── __init__.py │ ├── data_preprocess.py │ ├── metrics.py │ ├── omnidocbench.py │ ├── requirements.txt │ └── utils.py ├── __init__.py ├── cgbench.py ├── charxiv.py ├── cmmmu.py ├── creation.py ├── dude.py ├── dynamath.py ├── emma.py ├── image_base.py ├── image_caption.py ├── image_ccocr.py ├── image_mcq.py ├── image_mt.py ├── image_shortqa.py ├── image_vqa.py ├── image_yorn.py ├── longvideobench.py ├── megabench.py ├── miabench.py ├── mlvu.py ├── mmalignbench.py ├── mmbench_video.py ├── mmgenbench.py ├── mmifeval.py ├── mmlongbench.py ├── mmmath.py ├── moat.py ├── moviechat1k.py ├── mvbench.py ├── qbench_video.py ├── slidevqa.py ├── spatial457.py ├── tamperbench.py ├── tempcompass.py ├── text_base.py ├── text_mcq.py ├── utils │ ├── __init__.py │ ├── ccocr_evaluator │ │ ├── README.md │ │ ├── __init__.py │ │ ├── common.py │ │ ├── doc_parsing_evaluator.py │ │ ├── kie_evaluator.py │ │ └── ocr_evaluator.py │ ├── cgbench.py │ ├── crpe.py │ ├── hrbench.py │ ├── judge_util.py │ ├── llavabench.py │ ├── logicvista.py │ ├── longvideobench.py │ ├── mathv.py │ ├── mathverse.py │ ├── mathvista.py │ ├── megabench │ │ ├── README.md │ │ ├── __init__.py │ │ ├── aggregation │ │ │ ├── mean_agg.py │ │ │ ├── min_agg.py │ │ │ └── unsupported_agg.py │ │ ├── aggregation_type.py │ │ ├── evaluator.py │ │ ├── metric_type.py │ │ ├── parsing │ │ │ ├── answer_str_parse.py │ │ │ ├── common │ │ │ │ ├── parsers.py │ │ │ │ └── utils.py │ │ │ ├── dummy_parse.py │ │ │ └── json_parse.py │ │ ├── requirements.txt │ │ ├── response_parse_type.py │ │ ├── scoring │ │ │ ├── ascii_art_gpt4o_judge.py │ │ │ ├── chess_jaccard.py │ │ │ ├── 
common │ │ │ │ ├── conversions.py │ │ │ │ ├── metrics.py │ │ │ │ └── transformations.py │ │ │ ├── constrained_generation.py │ │ │ ├── coordinate_sequence_match.py │ │ │ ├── dict_equality.py │ │ │ ├── dict_exact_match_agg_recall.py │ │ │ ├── dict_jaccard_agg_jaccard.py │ │ │ ├── dict_nbbox_iou_tuple_agg_jaccard.py │ │ │ ├── dict_set_equality_agg_jaccard.py │ │ │ ├── exact_str_match.py │ │ │ ├── exact_str_match_case_insensitive.py │ │ │ ├── general_numerical_match.py │ │ │ ├── geo_proximity.py │ │ │ ├── gleu.py │ │ │ ├── jaccard.py │ │ │ ├── latex_expr_equality.py │ │ │ ├── longest_common_list_prefix_ratio.py │ │ │ ├── mse.py │ │ │ ├── multi_ref_phrase.py │ │ │ ├── nbbox_iou.py │ │ │ ├── near_str_match.py │ │ │ ├── nli_entailment.py │ │ │ ├── normalized_similarity_damerau_levenshtein.py │ │ │ ├── number_rel_diff_ratio.py │ │ │ ├── positive_int_match.py │ │ │ ├── program_judge.py │ │ │ ├── sacrebleu_bleu.py │ │ │ ├── sequence_equality.py │ │ │ ├── set_equality.py │ │ │ ├── set_precision.py │ │ │ ├── simple_str_match.py │ │ │ ├── symbolic_planning.py │ │ │ ├── unsupported_scoring.py │ │ │ ├── vlm_as_judge.py │ │ │ ├── xml_nbbox_iou.py │ │ │ ├── xml_norm_point_distance.py │ │ │ └── xml_norm_point_in_bbox.py │ │ └── utils.py │ ├── mlvu.py │ ├── mmbench_video.py │ ├── mmdu.py │ ├── mmif │ │ ├── __init__.py │ │ └── function_and_compare.py │ ├── mmniah.py │ ├── mmsci.py │ ├── mmsci4eval_req.txt │ ├── mmvet.py │ ├── moviechat1k.py │ ├── multiple_choice.py │ ├── mvbench.py │ ├── naturalbench.py │ ├── ocr_reasoning.py │ ├── ocrbench.py │ ├── olympiadbench.py │ ├── physic.py │ ├── physics_eval_utils.py │ ├── phyx.py │ ├── qbench_video.py │ ├── qspatial.py │ ├── shortqa.py │ ├── spatial457.py │ ├── tablevqabench.py │ ├── tamperbench.py │ ├── tdbench.py │ ├── tempcompass.py │ ├── vdc.py │ ├── vgrpbench │ │ ├── __init__.py │ │ ├── configs │ │ │ └── formating-prompt │ │ │ │ ├── aquarium │ │ │ │ └── filter_prompt.json │ │ │ │ ├── battleships │ │ │ │ └── filter_prompt.json │ │ │ │ ├── binairo │ │ │ │ └── filter_prompt.json │ │ │ │ ├── coloredsudoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── fieldexplore │ │ │ │ └── filter_prompt.json │ │ │ │ ├── futoshiki │ │ │ │ └── filter_prompt.json │ │ │ │ ├── hitori │ │ │ │ └── filter_prompt.json │ │ │ │ ├── jigsawsudoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── kakurasu │ │ │ │ └── filter_prompt.json │ │ │ │ ├── kakuro │ │ │ │ └── filter_prompt.json │ │ │ │ ├── killersudoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── lightup │ │ │ │ └── filter_prompt.json │ │ │ │ ├── nonogram │ │ │ │ └── filter_prompt.json │ │ │ │ ├── oddevensudoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── renzoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── skyscraper │ │ │ │ └── filter_prompt.json │ │ │ │ ├── starbattle │ │ │ │ └── filter_prompt.json │ │ │ │ ├── sudoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── thermometers │ │ │ │ └── filter_prompt.json │ │ │ │ └── treesandtents │ │ │ │ └── filter_prompt.json │ │ ├── evaluation.py │ │ ├── puzzles │ │ │ ├── aquarium.py │ │ │ ├── battleships.py │ │ │ ├── binairo.py │ │ │ ├── coloredsudoku.py │ │ │ ├── common_constriants.py │ │ │ ├── common_get_game_factory.py │ │ │ ├── common_get_prompt.py │ │ │ ├── common_puzzle_factory.py │ │ │ ├── fieldexplore.py │ │ │ ├── futoshiki.py │ │ │ ├── hitori.py │ │ │ ├── jigsawsudoku.py │ │ │ ├── kakurasu.py │ │ │ ├── kakuro.py │ │ │ ├── killersudoku.py │ │ │ ├── lightup.py │ │ │ ├── nonogram.py │ │ │ ├── oddevensudoku.py │ │ │ ├── renzoku.py │ │ │ ├── skyscraper.py │ │ │ ├── starbattle.py │ │ │ ├── sudoku.py │ │ │ ├── 
thermometers.py │ │ │ └── treesandtents.py │ │ └── score.py │ ├── video_mmlu.py │ ├── videomme.py │ ├── visulogic.py │ ├── vlm2bench.py │ ├── vmcbench.py │ ├── vqa_eval.py │ ├── wemath.py │ ├── worldsense.py │ └── yorn.py ├── vcr.py ├── vdc.py ├── video_base.py ├── video_concat_dataset.py ├── video_dataset_config.py ├── video_mmlu.py ├── videomme.py ├── vl_rewardbench.py ├── vlm2bench.py ├── wildvision.py └── worldsense.py ├── inference.py ├── inference_mt.py ├── inference_video.py ├── smp ├── __init__.py ├── file.py ├── log.py ├── misc.py └── vlm.py ├── tools.py ├── utils ├── __init__.py ├── matching_util.py ├── mp_util.py └── result_transfer.py └── vlm ├── __init__.py ├── aki.py ├── aria.py ├── base.py ├── bunnyllama3.py ├── cambrian.py ├── chameleon.py ├── cogvlm.py ├── deepseek_vl.py ├── deepseek_vl2.py ├── eagle_x.py ├── emu.py ├── falcon_vlm.py ├── flash_vl.py ├── gemma.py ├── h2ovl_mississippi.py ├── idefics.py ├── instructblip.py ├── internvl ├── __init__.py ├── gui_template.yaml ├── internvl_chat.py └── utils.py ├── janus.py ├── kimi_vl.py ├── kosmos.py ├── llama4.py ├── llama_vision.py ├── llava ├── __init__.py ├── llava.py └── llava_xtuner.py ├── long_vita.py ├── mantis.py ├── mgm.py ├── minicpm_v.py ├── minigpt4.py ├── minimonkey.py ├── misc ├── blip2_instruct_vicuna13b.yaml ├── blip2_instruct_vicuna7b.yaml ├── minigpt4_13b_eval.yaml ├── minigpt4_7b_eval.yaml └── minigptv2_eval.yaml ├── mixsense.py ├── mmalaya.py ├── molmo.py ├── monkey.py ├── moondream.py ├── mplug_owl2.py ├── mplug_owl3.py ├── nvlm.py ├── ola ├── __init__.py ├── ola │ ├── arguments.py │ ├── constants.py │ ├── conversation.py │ ├── datasets │ │ ├── __init__.py │ │ └── preprocess.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── language_model │ │ │ └── ola_qwen.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── oryx_vit.py │ │ ├── multimodal_projector │ │ │ ├── builder.py │ │ │ └── pooler_projector.py │ │ ├── multimodal_resampler │ │ │ ├── builder.py │ │ │ └── perceiver.py │ │ ├── ola_arch.py │ │ ├── speech_encoder │ │ │ ├── beats │ │ │ │ ├── BEATs.py │ │ │ │ ├── Tokenizers.py │ │ │ │ ├── __init__.py │ │ │ │ ├── backbone.py │ │ │ │ ├── kaldi.py │ │ │ │ ├── modules.py │ │ │ │ └── quantizer.py │ │ │ ├── builder.py │ │ │ └── speech_encoder.py │ │ └── speech_projector │ │ │ ├── builder.py │ │ │ └── speech_projector.py │ └── utils.py └── ola_model.py ├── omchat.py ├── omnilmm.py ├── open_flamingo.py ├── ovis ├── __init__.py ├── ovis.py └── utils │ ├── __init__.py │ └── mdp3.py ├── pandagpt.py ├── parrot.py ├── phi3_vision.py ├── phi4_multimodal.py ├── pixtral.py ├── points.py ├── qh_360vl.py ├── qwen2_vl ├── __init__.py ├── model.py └── prompt.py ├── qwen_vl.py ├── rbdash.py ├── ristretto.py ├── ross.py ├── sail_vl.py ├── slime.py ├── smolvlm.py ├── transcore_m.py ├── ursa ├── __init__.py ├── ursa_chat.py └── ursa_model │ ├── __init__.py │ ├── clip_encoder.py │ ├── configuration_ursa.py │ ├── image_processing_vlm.py │ ├── modeling_ursa.py │ ├── processing_ursa.py │ ├── projector.py │ ├── sam.py │ └── siglip_vit.py ├── valley ├── __init__.py ├── requirements_valley.txt └── valley.py ├── video_llm ├── __init__.py ├── chat_uni_vi.py ├── configs │ ├── llama_vid │ │ └── processor │ │ │ └── clip-patch14-224 │ │ │ ├── config.json │ │ │ └── preprocessor_config.json │ └── videochat2_hd.json ├── llama_vid.py ├── pllava.py ├── video_chatgpt.py ├── video_llava.py └── videochat2.py ├── vila.py ├── vintern_chat.py ├── visualglm.py ├── vita.py ├── vlaa_thinker.py ├── vlm_r1.py ├── 
vxverse.py ├── wemm.py ├── wethink_vl.py ├── xcomposer ├── __init__.py ├── sharecaptioner.py ├── xcomposer.py ├── xcomposer2.py ├── xcomposer2_4KHD.py └── xcomposer2d5.py ├── xgen_mm.py └── yi_vl.py /.github/scripts/assert_score.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import ast 3 | import json 4 | import os 5 | 6 | import pandas as pd 7 | 8 | 9 | def validate_scores(dataset_list, assert_score, model_name): 10 | for dataset in dataset_list: 11 | base_score = assert_score[dataset][model_name] 12 | if dataset == "OCRBench_MINI": 13 | score_file = os.path.join("outputs", f"{model_name}/{model_name}_{dataset}_score.json") 14 | cur_score = 0 15 | with open(score_file, "r") as f: 16 | total_score = json.load(f) 17 | cur_score = total_score["Final Score Norm"] 18 | assert ( 19 | abs(cur_score - float(base_score)) <= 0.01 20 | ), f"{dataset} on {model_name}: cur_score is {cur_score}, base_score is {base_score}" 21 | else: 22 | score_file = os.path.join("outputs", f"{model_name}/{model_name}_{dataset}_acc.csv") 23 | df = pd.read_csv(score_file) 24 | cur_score = df["Overall"].iloc[0] 25 | if dataset == "MMBench_V11_MINI": 26 | cur_score = df.loc[df["split"] == "dev", "Overall"].values 27 | assert ( 28 | abs(cur_score - float(base_score)) <= 0.01 29 | ), f"{dataset} on {model_name}: cur_score is {cur_score}, base_score is {base_score}" 30 | print(f"cur_score is {cur_score}, base_score is {base_score}") 31 | 32 | 33 | def parse_arguments(): 34 | parser = argparse.ArgumentParser(description="Validate model scores against csv/json data") 35 | 36 | parser.add_argument("--dataset", type=str, required=True, help="Space-separated list of datasets") 37 | 38 | parser.add_argument( 39 | "--base_score", type=str, required=True, help="Dictionary string in format {dataset:{model:score}}" 40 | ) 41 | 42 | parser.add_argument("--model-name", type=str, required=True, help="Name of the model to validate") 43 | 44 | return parser.parse_args() 45 | 46 | 47 | def main(): 48 | args = parse_arguments() 49 | 50 | try: 51 | dataset_list = args.dataset.split() 52 | base_score = ast.literal_eval(args.base_score) 53 | except Exception as e: 54 | print(f"Parameter parsing error: {str(e)}") 55 | return 56 | 57 | validate_scores(dataset_list, base_score, args.model_name) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.10 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.10.15 18 | - name: Install pre-commit hook 19 | run: | 20 | pip install pre-commit 21 | pre-commit install 22 | - name: Linting 23 | run: pre-commit run --all-files 24 | -------------------------------------------------------------------------------- /.github/workflows/pr-run-test.yml: -------------------------------------------------------------------------------- 1 | name: pr_run_test 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - "main" 7 | paths-ignore: 8 | - "docs/**" 9 | - "**.md" 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 13 | 
cancel-in-progress: true 14 | 15 | env: 16 | BASE_SCORE: '{"MMBench_V11_MINI":{"Qwen2-VL-7B-Instruct":0.8727272727272727,"InternVL2_5-8B":0.8727272727272727,"llava_onevision_qwen2_7b_si":0.8363636363636363},"MMStar_MINI":{"Qwen2-VL-7B-Instruct":0.6266666666666667,"InternVL2_5-8B":0.6333333333333333,"llava_onevision_qwen2_7b_si":0.49333333333333335},"AI2D_MINI":{"Qwen2-VL-7B-Instruct":0.7854251012145749,"InternVL2_5-8B":0.8421052631578947,"llava_onevision_qwen2_7b_si":0.8178137651821862},"OCRBench_MINI":{"Qwen2-VL-7B-Instruct":16.6,"InternVL2_5-8B":16.4,"llava_onevision_qwen2_7b_si":12.9}}' 17 | 18 | jobs: 19 | vlm_test: 20 | if: ${{!cancelled()}} 21 | runs-on: [linux-a100] 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | model: [Qwen/Qwen2-VL-7B-Instruct,OpenGVLab/InternVL2_5-8B,lmms-lab/llava-onevision-qwen2-7b-si] 26 | dataset: ["MMBench_V11_MINI MMStar_MINI AI2D_MINI","OCRBench_MINI"] 27 | container: 28 | image: kkscilife/vlmevalkit_2:a100 29 | options: "--gpus=all --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy --pull never" 30 | volumes: 31 | - /mnt/187:/mnt/187 32 | steps: 33 | - name: clone_repo 34 | uses: actions/checkout@v3 35 | - name: evaluation_model 36 | run: | 37 | pip install -e . 38 | pre_model=$(echo ${{matrix.model}} | awk -F'/' '{print $1}') 39 | ln -s /mnt/187/$pre_model . 40 | if [ "${{matrix.model}}" = "lmms-lab/llava-onevision-qwen2-7b-si" ];then 41 | model_name="llava_onevision_qwen2_7b_si" 42 | else 43 | model_name=$(echo ${{matrix.model}} | awk -F'/' '{print $2}') 44 | fi 45 | nvidia-smi 46 | python run.py --data ${{matrix.dataset}} --model $model_name 47 | python .github/scripts/assert_score.py --dataset "${{matrix.dataset}}" --base_score $BASE_SCORE --model-name $model_name 48 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: | 2 | (?x)^( 3 | scripts/| 4 | assets/| 5 | vlmeval/config.py | 6 | vlmeval/dataset/utils/wemath.py | 7 | vlmeval/dataset/Omnidocbench/ | 8 | vlmeval/dataset/utils/megabench/ | 9 | vlmeval/dataset/utils/vgrpbench/ | 10 | vlmeval/vlm/ola/ | 11 | vlmeval/vlm/ursa/ | 12 | vlmeval/vlm/ovis/ 13 | ) 14 | repos: 15 | - repo: https://github.com/PyCQA/flake8 16 | rev: 6.1.0 17 | hooks: 18 | - id: flake8 19 | args: 20 | [ 21 | "--max-line-length=120", 22 | "--ignore=F401,F403,F405,E402,E722,E741,W503,E231,E702", 23 | ] 24 | exclude: ^configs/ 25 | - repo: https://github.com/pre-commit/mirrors-yapf 26 | rev: v0.30.0 27 | hooks: 28 | - id: yapf 29 | args: ["--style={column_limit=120}"] 30 | - repo: https://github.com/pre-commit/pre-commit-hooks 31 | rev: v3.1.0 32 | hooks: 33 | - id: trailing-whitespace 34 | - id: check-yaml 35 | - id: end-of-file-fixer 36 | - id: requirements-txt-fixer 37 | - id: check-merge-conflict 38 | - id: fix-encoding-pragma 39 | args: ["--remove"] 40 | - id: mixed-line-ending 41 | args: ["--fix=lf"] 42 | -------------------------------------------------------------------------------- /assets/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/assets/apple.jpg -------------------------------------------------------------------------------- /docs/en/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you 
might need 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: "3.8" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/en/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | -------------------------------------------------------------------------------- /docs/en/Contributors.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | ## Contributors w. 3+ Major Contributions 4 | 5 | > In this section, we list all the contributors who have made significant contributions (3+) to the development of VLMEvalKit. 6 | 7 | New Qualified Contributors (2024.09): 8 | 9 | 1. [amitbcp](https://github.com/amitbcp): The contributor helped support MUIRBench, Phi-3.5, Idefics3, VILA, and xGen-MM 10 | 2. [czczup](https://github.com/czczup): The contributor helped support the InternVL Series (V1.5, Mini-InternVL, V2, etc.) 11 | 3. [DseidLi](https://github.com/DseidLi): The contributor helped support LLaVA-OneVision, GQA, and developed the readthedocs site for VLMEvalKit 12 | 4. [mayubo2333](https://github.com/mayubo2333): The contributor helped support MMLongBench, SlideVQA, and DUDE 13 | 5. [sun-hailong](https://github.com/sun-hailong): The contributor helped support A-OKVQA, Parrot, MMMB, and MTL-MMBench 14 | 6. [PhoenixZ810](https://github.com/PhoenixZ810): The contributor helped support Video-ChatGPT, Chat-UniVI, and Llama-VID 15 | 7. [Cuiunbo](https://github.com/Cuiunbo): The contributor helped support OmniLMM-12B, MiniCPM-V Series (V1, V2, V2.5) 16 | 17 | ## Full Contributor List 18 | 19 | > In this section, we list all the contributors as well as their corresponding contributions to the development of VLMEvalKit. 20 | 21 | TBD. 22 | -------------------------------------------------------------------------------- /docs/en/EvalByLMDeploy.md: -------------------------------------------------------------------------------- 1 | # Using LMDeploy to Accelerate Evaluation and Inference 2 | 3 | VLMEvalKit supports testing VLM models deployed by LMDeploy. Below, we use InternVL2-8B as an example to show how to test the model. 4 | 5 | ## Step 0: Install LMDeploy 6 | 7 | ```bash 8 | pip install lmdeploy 9 | ``` 10 | For other installation methods, you can refer to LMDeploy's [documentation](https://github.com/InternLM/lmdeploy). 11 | 12 | ## Step 1: Start the Inference Service 13 | 14 | ```bash 15 | lmdeploy serve api_server OpenGVLab/InternVL2-8B --model-name InternVL2-8B 16 | ``` 17 | > [!IMPORTANT] 18 | > Since models in VLMEvalKit may have custom behaviors when building prompts for different datasets, such as InternVL2's handling of HallusionBench, it is necessary to specify `--model-name` when starting the server. This allows the VLMEvalKit to select appropriate prompt construction strategy based on the name when using the LMDeploy API. 19 | > 20 | > If `--server-port`, is specified, the corresponding environment variable `LMDEPLOY_API_BASE` needs to be set. 21 | 22 | 23 | ## Step 2: Evaluation 24 | 25 | ```bash 26 | python run.py --data MMStar --model lmdeploy --verbose --api-nproc 64 27 | ``` 28 | -------------------------------------------------------------------------------- /docs/en/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
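# For example (a typical invocation, assuming Sphinx and the packages in requirements/docs.txt are installed):
#   make html SPHINXOPTS="-W"   # build the HTML docs into _build/html, treating warnings as errors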
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/en/_static/css/readthedocs.css: -------------------------------------------------------------------------------- 1 | .header-logo { 2 | background-image: url("../image/logo.svg"); 3 | background-size: 275px 80px; 4 | height: 80px; 5 | width: 275px; 6 | } 7 | 8 | 9 | @media screen and (min-width: 1100px) { 10 | .header-logo { 11 | top: -25px; 12 | } 13 | } 14 | 15 | pre { 16 | white-space: pre; 17 | } 18 | 19 | @media screen and (min-width: 2000px) { 20 | .pytorch-content-left { 21 | width: 1200px; 22 | margin-left: 30px; 23 | } 24 | article.pytorch-article { 25 | max-width: 1200px; 26 | } 27 | .pytorch-breadcrumbs-wrapper { 28 | width: 1200px; 29 | } 30 | .pytorch-right-menu.scrolling-fixed { 31 | position: fixed; 32 | top: 45px; 33 | left: 1580px; 34 | } 35 | } 36 | 37 | 38 | article.pytorch-article section code { 39 | padding: .2em .4em; 40 | background-color: #f3f4f7; 41 | border-radius: 5px; 42 | } 43 | 44 | /* Disable the change in tables */ 45 | article.pytorch-article section table code { 46 | padding: unset; 47 | background-color: unset; 48 | border-radius: unset; 49 | } 50 | 51 | table.autosummary td { 52 | width: 50% 53 | } 54 | 55 | img.align-center { 56 | display: block; 57 | margin-left: auto; 58 | margin-right: auto; 59 | } 60 | 61 | article.pytorch-article p.rubric { 62 | font-weight: bold; 63 | } 64 | -------------------------------------------------------------------------------- /docs/en/_static/image/logo_icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/en/_static/js/custom.js: -------------------------------------------------------------------------------- 1 | var collapsedSections = []; 2 | 3 | $(document).ready(function () { 4 | $('.model-summary').DataTable({ 5 | "stateSave": false, 6 | "lengthChange": false, 7 | "pageLength": 20, 8 | "order": [] 9 | }); 10 | }); 11 | -------------------------------------------------------------------------------- /docs/en/_templates/404.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block body %} 4 | 5 |

Page Not Found 6 | 7 | The page you are looking for cannot be found. 8 | 9 | 10 | If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in 11 | the content table left, or go to the homepage. 12 |
13 | 17 | 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /docs/en/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | .. 12 | autogenerated from _templates/autosummary/class.rst 13 | note it does not have :inherited-members: 14 | -------------------------------------------------------------------------------- /docs/en/_templates/callable.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | :special-members: __call__ 11 | 12 | .. 13 | autogenerated from _templates/callable.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /docs/en/docutils.conf: -------------------------------------------------------------------------------- 1 | [html writers] 2 | table_style: colwidths-auto 3 | -------------------------------------------------------------------------------- /docs/en/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to the VLMEvalKit Tutorial! 2 | ========================================== 3 | 4 | VLMEvalKit Getting Started Guide 5 | ------------------------------- 6 | 7 | To help users get started quickly, we recommend the following process: 8 | 9 | - For users who want to use VLMEvalKit, we recommend reading the "Start Your First Step" section to set up the environment and start a mini-experiment to familiarize yourself with the process. 10 | 11 | - If you want to customize more modules, such as adding datasets and models, we provide an "Advanced Tutorial." 12 | 13 | We always welcome users' PRs (Pull Requests) and Issues to improve VLMEvalKit! 14 | 15 | .. _Start Your First Step: 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Start Your First Step 19 | 20 | Quickstart.md 21 | 22 | .. _Advanced Tutorial: 23 | .. toctree:: 24 | :maxdepth: 1 25 | :caption: Advanced Tutorial 26 | 27 | Development.md 28 | ConfigSystem.md 29 | 30 | .. _Other Notes: 31 | .. 
toctree:: 32 | :maxdepth: 1 33 | :caption: Other Notes 34 | 35 | Contributors.md 36 | 37 | Index and Tables 38 | ================== 39 | 40 | * :ref:`genindex` 41 | * :ref:`search` 42 | -------------------------------------------------------------------------------- /docs/zh-CN/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: "3.8" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/zh-CN/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | -------------------------------------------------------------------------------- /docs/zh-CN/ConfigSystem.md: -------------------------------------------------------------------------------- 1 | 2 | # 配置系统 3 | 4 | 默认情况下,VLMEvalKit通过在`run.py`脚本中使用`--model`和`--data`参数设置模型名称(在`/vlmeval/config.py`中定义)和数据集名称(在`vlmeval/dataset/__init__.py` 或 `vlmeval/dataset/video_dataset_config.py` 中定义)来启动评估。这种方法在大多数情况下简单且高效,但当用户希望使用不同设置评估多个模型/数据集时,可能不够灵活。 5 | 6 | 为了解决这个问题,VLMEvalKit提供了一个更灵活的配置系统。用户可以在json文件中指定模型和数据集设置,并通过`--config`参数将配置文件的路径传递给`run.py`脚本。以下是一个示例配置json: 7 | 8 | ```json 9 | { 10 | "model": { 11 | "GPT4o_20240806_T00_HIGH": { 12 | "class": "GPT4V", 13 | "model": "gpt-4o-2024-08-06", 14 | "temperature": 0, 15 | "img_detail": "high" 16 | }, 17 | "GPT4o_20240806_T10_Low": { 18 | "class": "GPT4V", 19 | "model": "gpt-4o-2024-08-06", 20 | "temperature": 1.0, 21 | "img_detail": "low" 22 | }, 23 | "GPT4o_20241120": {} 24 | }, 25 | "data": { 26 | "MME-RealWorld-Lite": { 27 | "class": "MMERealWorld", 28 | "dataset": "MME-RealWorld-Lite" 29 | }, 30 | "MMBench_DEV_EN_V11": { 31 | "class": "ImageMCQDataset", 32 | "dataset": "MMBench_DEV_EN_V11" 33 | }, 34 | "MMBench_Video_8frame_nopack":{}, 35 | "Video-MME_16frame_subs": { 36 | "class": "VideoMME", 37 | "dataset": "Video-MME", 38 | "nframe": 16, 39 | "use_subtitle": true 40 | } 41 | } 42 | } 43 | ``` 44 | 45 | 配置json的解释: 46 | 47 | 1. 现在我们支持两个字段:`model`和`data`,每个字段都是一个字典。字典的键是模型/数据集的名称(由用户设置),值是模型/数据集的设置。 48 | 2. 对于`model`中的项目,值是一个包含以下键的字典: 49 | - `class`:模型的类名,应该是`vlmeval/vlm/__init__.py`(开源模型)或`vlmeval/api/__init__.py`(API模型)中定义的类名。 50 | - 其他kwargs:其他kwargs是模型特定的参数,请参考模型类的定义以获取详细用法。例如,`model`、`temperature`、`img_detail`是`GPT4V`类的参数。值得注意的是,大多数模型类都需要`model`参数。 51 | - Tip:在位于`vlmeval/config.py`的变量`supported_VLM`中的已经被定义的模型可以作为`model`的键,而不需要填对应的值即可启动。例如,`GPT4o_20240806_T00_HIGH: {}`是等价于`GPT4o_20240806_T00_HIGH: {'class': 'GPT4V', 'model': 'gpt-4o-2024-08-06', 'temperature': 0, 'img_size': -1, 'img_detail': 'high', 'retry': 10, 'verbose': False}`。 52 | 3. 
对于字典`data`,我们建议用户使用官方数据集名称作为键(或键的一部分),因为我们经常根据数据集名称确定后处理/判断设置。对于`data`中的项目,值是一个包含以下键的字典: 53 | - `class`:数据集的类名,应该是`vlmeval/dataset/__init__.py`中定义的类名。 54 | - 其他kwargs:其他kwargs是数据集特定的参数,请参考数据集类的定义以获取详细用法。通常,大多数数据集类都需要`dataset`参数。大多数视频数据集类都需要 `nframe` 或 `fps` 参数。 55 | - Tip:在位于`vlmeval/dataset/video_dataset_config.py`的变量`supported_video_dataset`中的已经被定义的数据集可以作为`data`的键,而不需要填对应的值即可启动。例如,`MMBench_Video_8frame_nopack: {}`是等价于`MMBench_Video_8frame_nopack: {'class': 'MMBenchVideo', 'dataset': 'MMBench-Video', 'nframe': 8, 'pack': False}`。 56 | 57 | 将示例配置json保存为`config.json`,您可以通过以下命令启动评估: 58 | 59 | ```bash 60 | python run.py --config config.json 61 | ``` 62 | 63 | 这将在工作目录`$WORK_DIR`下生成以下输出文件(格式为`{$WORK_DIR}/{$MODEL_NAME}/{$MODEL_NAME}_{$DATASET_NAME}_*`): 64 | 65 | - `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MME-RealWorld-Lite*` 66 | - `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MME-RealWorld-Lite*` 67 | - `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MMBench_DEV_EN_V11*` 68 | - `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MMBench_DEV_EN_V11*` 69 | ...... 70 | -------------------------------------------------------------------------------- /docs/zh-CN/EvalByLMDeploy.md: -------------------------------------------------------------------------------- 1 | # 使用 LMDeploy 加速评测推理 2 | 3 | VLMEvalKit 支持测试由 LMDeploy 部署的 VLM 模型,下面以 InternVL2-8B 为例,展示如何测试模型 4 | 5 | ## 第0步 安装 LMDeploy 6 | 7 | ```bash 8 | pip install lmdeploy 9 | ``` 10 | 11 | 其他安装方式可以参考 LMDeploy 的[文档](https://github.com/InternLM/lmdeploy) 12 | 13 | ## 第1步 启动推理服务 14 | 15 | ```bash 16 | lmdeploy serve api_server OpenGVLab/InternVL2-8B --model-name InternVL2-8B 17 | ``` 18 | > [!IMPORTANT] 19 | > 因为 VLMEvalKit 中的模型对于不同数据集在构建 prompt 时可能有自定义行为,如 InternVL2 对于 HallusionBench 的处理,所以,server 端在启动的时候需要指定 `--model-name`,这样在使用 LMDEploy api 时可以根据名字选择合适的 prompt 构建策略。 20 | > 21 | > 如果指定了 `--server-port`,需要设置对应的环境变量 `LMDEPLOY_API_BASE` 22 | 23 | 24 | ## 第2步 评测 25 | 26 | ```bash 27 | python run.py --data MMStar --model InternVL2-8B --verbose --api-nproc 64 28 | ``` 29 | -------------------------------------------------------------------------------- /docs/zh-CN/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/zh-CN/_static/css/readthedocs.css: -------------------------------------------------------------------------------- 1 | .header-logo { 2 | background-image: url("../image/logo.svg"); 3 | background-size: 275px 80px; 4 | height: 80px; 5 | width: 275px; 6 | } 7 | 8 | 9 | @media screen and (min-width: 1100px) { 10 | .header-logo { 11 | top: -25px; 12 | } 13 | } 14 | 15 | pre { 16 | white-space: pre; 17 | } 18 | 19 | @media screen and (min-width: 2000px) { 20 | .pytorch-content-left { 21 | width: 1200px; 22 | margin-left: 30px; 23 | } 24 | article.pytorch-article { 25 | max-width: 1200px; 26 | } 27 | .pytorch-breadcrumbs-wrapper { 28 | width: 1200px; 29 | } 30 | .pytorch-right-menu.scrolling-fixed { 31 | position: fixed; 32 | top: 45px; 33 | left: 1580px; 34 | } 35 | } 36 | 37 | 38 | article.pytorch-article section code { 39 | padding: .2em .4em; 40 | background-color: #f3f4f7; 41 | border-radius: 5px; 42 | } 43 | 44 | /* Disable the change in tables */ 45 | article.pytorch-article section table code { 46 | padding: unset; 47 | background-color: unset; 48 | border-radius: unset; 49 | } 50 | 51 | table.autosummary td { 52 | width: 50% 53 | } 54 | 55 | img.align-center { 56 | display: block; 57 | margin-left: auto; 58 | margin-right: auto; 59 | } 60 | 61 | article.pytorch-article p.rubric { 62 | font-weight: bold; 63 | } 64 | -------------------------------------------------------------------------------- /docs/zh-CN/_static/image/logo_icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/zh-CN/_static/js/custom.js: -------------------------------------------------------------------------------- 1 | var collapsedSections = []; 2 | 3 | $(document).ready(function () { 4 | $('.model-summary').DataTable({ 5 | "stateSave": false, 6 | "lengthChange": false, 7 | "pageLength": 20, 8 | "order": [] 9 | }); 10 | }); 11 | -------------------------------------------------------------------------------- /docs/zh-CN/_templates/404.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block body %} 4 | 5 |

Page Not Found 6 | 7 | The page you are looking for cannot be found. 8 | 9 | 10 | If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in 11 | the content table left, or go to the homepage. 12 |
13 | 17 | 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /docs/zh-CN/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | .. 12 | autogenerated from _templates/autosummary/class.rst 13 | note it does not have :inherited-members: 14 | -------------------------------------------------------------------------------- /docs/zh-CN/_templates/callable.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | :special-members: __call__ 11 | 12 | .. 13 | autogenerated from _templates/callable.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /docs/zh-CN/cp_origin_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copy *.md files from docs/ if it doesn't have a Chinese translation 4 | 5 | for filename in $(find ../en/ -name '*.md' -printf "%P\n"); 6 | do 7 | mkdir -p $(dirname $filename) 8 | cp -n ../en/$filename ./$filename 9 | done 10 | -------------------------------------------------------------------------------- /docs/zh-CN/docutils.conf: -------------------------------------------------------------------------------- 1 | [html writers] 2 | table_style: colwidths-auto 3 | -------------------------------------------------------------------------------- /docs/zh-CN/index.rst: -------------------------------------------------------------------------------- 1 | 欢迎来到 VLMEvalKit 中文教程! 2 | ========================================== 3 | 4 | VLMEvalKit 上手路线 5 | ------------------------------- 6 | 7 | 为了用户能够快速上手,我们推荐以下流程: 8 | 9 | - 对于想要使用 VLMEvalKit 的用户,我们推荐先阅读 开始你的第一步_ 部分来设置环境,并启动一个迷你实验熟悉流程。 10 | 11 | - 若您想进行更多模块的自定义,例如增加数据集和模型,我们提供了 进阶教程_ 。 12 | 13 | 我们始终非常欢迎用户的 PRs 和 Issues 来完善 VLMEvalKit! 14 | 15 | .. _快速开始: 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: 快速开始 19 | 20 | Quickstart.md 21 | 22 | 23 | .. .. _教程: 24 | .. .. toctree:: 25 | .. :maxdepth: 1 26 | .. :caption: 教程 27 | 28 | .. user_guides/framework_overview.md 29 | 30 | .. _进阶教程: 31 | .. toctree:: 32 | :maxdepth: 1 33 | :caption: 进阶教程 34 | 35 | Development.md 36 | ConfigSystem.md 37 | 38 | .. .. _其他说明: 39 | .. .. toctree:: 40 | .. :maxdepth: 1 41 | .. :caption: 其他说明 42 | 43 | .. 
notes/contribution_guide.md 44 | 45 | 索引与表格 46 | ================== 47 | 48 | * :ref:`genindex` 49 | * :ref:`search` 50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | dotenv 3 | einops 4 | # for gemini api 5 | google-genai 6 | gradio 7 | huggingface_hub 8 | imageio 9 | ipdb 10 | json_repair 11 | matplotlib 12 | nltk 13 | numpy 14 | omegaconf 15 | openai 16 | opencv-python>=4.4.0.46 17 | openpyxl 18 | pandas 19 | pillow 20 | portalocker 21 | protobuf 22 | python-dotenv 23 | qwen_vl_utils 24 | requests 25 | rich 26 | sentencepiece 27 | setuptools 28 | sty 29 | tabulate 30 | tiktoken 31 | timeout-decorator 32 | timm 33 | torch 34 | torchvision 35 | tqdm 36 | transformers 37 | typing_extensions 38 | validators 39 | xlsxwriter 40 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | docutils==0.18.1 2 | modelindex 3 | myst-parser 4 | -e git+https://github.com/open-compass/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme 5 | sphinx==6.1.3 6 | sphinx-copybutton 7 | sphinx-design 8 | sphinx-notfound-page 9 | sphinx-tabs 10 | sphinxcontrib-jquery 11 | tabulate 12 | -------------------------------------------------------------------------------- /scripts/apires_scan.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from vlmeval import * 3 | from vlmeval.dataset import SUPPORTED_DATASETS 4 | FAIL_MSG = 'Failed to obtain answer via API.' 5 | 6 | root = sys.argv[1] 7 | if root[-1] in '/\\': 8 | root = root[:-1] 9 | 10 | model_name = root.split('/')[-1] 11 | 12 | for d in SUPPORTED_DATASETS: 13 | fname = f'{model_name}_{d}.xlsx' 14 | pth = osp.join(root, fname) 15 | if osp.exists(pth): 16 | data = load(pth) 17 | # Detect Failure 18 | assert 'prediction' in data 19 | data['prediction'] = [str(x) for x in data['prediction']] 20 | fail = [FAIL_MSG in x for x in data['prediction']] 21 | if sum(fail): 22 | nfail = sum(fail) 23 | ntot = len(fail) 24 | print(f'Model {model_name} x Dataset {d}: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ') 25 | 26 | eval_files = ls(root, match=f'{model_name}_{d}_') 27 | eval_files = [x for x in eval_files if listinstr([f'{d}_openai', f'{d}_gpt'], x) and x.endswith('.xlsx')] 28 | 29 | if len(eval_files) == 0: 30 | print(f'Model {model_name} x Dataset {d} openai missing') 31 | continue 32 | 33 | assert len(eval_files) == 1 34 | eval_file = eval_files[0] 35 | data = load(eval_file) 36 | 37 | if 'MMVet' in d: 38 | bad = [x for x in data['log'] if 'All 5 retries failed.' 
in str(x)] 39 | if len(bad): 40 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 41 | elif 'MathVista' in d: 42 | bad = [x for x in data['res'] if FAIL_MSG in str(x)] 43 | if len(bad): 44 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 45 | 46 | elif d == 'LLaVABench': 47 | sub = data[data['gpt4_score'] == -1] 48 | sub = sub[sub['gpt4_score'] == -1] 49 | if len(sub): 50 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(sub)} out of {len(data)} failed.') 51 | else: 52 | bad = [x for x in data['log'] if FAIL_MSG in str(x)] 53 | if len(bad): 54 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 55 | -------------------------------------------------------------------------------- /scripts/auto_run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from vlmeval.smp import * 3 | from vlmeval.config import supported_VLM 4 | 5 | def is_api(x): 6 | return getattr(supported_VLM[x].func, 'is_api', False) 7 | 8 | models = list(supported_VLM) 9 | models = [x for x in models if 'fs' not in x] 10 | models = [x for x in models if not is_api(x)] 11 | exclude_list = ['cogvlm-grounding-generalist', 'emu2'] 12 | models = [x for x in models if x not in exclude_list] 13 | 14 | def is_large(x): 15 | return '80b' in x or 'emu2' in x or '34B' in x 16 | 17 | small_models = [x for x in models if not is_large(x)] 18 | large_models = [x for x in models if is_large(x)] 19 | models = small_models + large_models 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--data', type=str, nargs='+', required=True) 23 | args = parser.parse_args() 24 | 25 | # Skip some models 26 | models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)] 27 | 28 | for m in models: 29 | unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')] 30 | if len(unknown_datasets) == 0: 31 | continue 32 | dataset_str = ' '.join(unknown_datasets) 33 | if '80b' in m: 34 | cmd = f'python run.py --data {dataset_str} --model {m}' 35 | else: 36 | cmd = f'bash run.sh --data {dataset_str} --model {m}' 37 | print(cmd) 38 | os.system(cmd) -------------------------------------------------------------------------------- /scripts/cover.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 3 | cp $DIR/../config.py $DIR/../vlmeval/ 4 | cp $DIR/../misc/* $DIR/../vlmeval/vlm/misc/ -------------------------------------------------------------------------------- /scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | export GPU=$(nvidia-smi --list-gpus | wc -l) 4 | torchrun --nproc-per-node=$GPU run.py ${@:1} -------------------------------------------------------------------------------- /scripts/srun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2} -------------------------------------------------------------------------------- /vlmeval/__init__.py: -------------------------------------------------------------------------------- 1 | import ssl 2 | ssl._create_default_https_context = 
ssl._create_unverified_context 3 | # Temporarily bypass SSL certificate verification to download files from oss. 4 | 5 | try: 6 | import torch 7 | except ImportError: 8 | pass 9 | 10 | from .smp import * 11 | from .api import * 12 | from .dataset import * 13 | from .utils import * 14 | from .vlm import * 15 | from .config import * 16 | from .tools import cli 17 | 18 | load_env() 19 | 20 | __version__ = '0.2rc1' 21 | -------------------------------------------------------------------------------- /vlmeval/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import OpenAIWrapper, GPT4V 2 | from .hf_chat_model import HFChatModel 3 | from .gemini import GeminiWrapper, Gemini 4 | from .qwen_vl_api import QwenVLWrapper, QwenVLAPI, Qwen2VLAPI 5 | from .qwen_api import QwenAPI 6 | from .claude import Claude_Wrapper, Claude3V 7 | from .reka import Reka 8 | from .glm_vision import GLMVisionAPI 9 | from .cloudwalk import CWWrapper 10 | from .sensechat_vision import SenseChatVisionAPI 11 | from .siliconflow import SiliconFlowAPI, TeleMMAPI 12 | from .hunyuan import HunyuanVision 13 | from .bailingmm import bailingMMAPI 14 | from .bluelm_v_api import BlueLMWrapper, BlueLM_V_API 15 | from .jt_vl_chat import JTVLChatAPI 16 | from .taiyi import TaiyiAPI 17 | from .lmdeploy import LMDeployAPI 18 | from .taichu import TaichuVLAPI, TaichuVLRAPI 19 | from .doubao_vl_api import DoubaoVL 20 | from .mug_u import MUGUAPI 21 | 22 | __all__ = [ 23 | 'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V', 'Gemini', 24 | 'QwenVLWrapper', 'QwenVLAPI', 'QwenAPI', 'Claude3V', 'Claude_Wrapper', 25 | 'Reka', 'GLMVisionAPI', 'CWWrapper', 'SenseChatVisionAPI', 'HunyuanVision', 26 | 'Qwen2VLAPI', 'BlueLMWrapper', 'BlueLM_V_API', 'JTVLChatAPI', 27 | 'bailingMMAPI', 'TaiyiAPI', 'TeleMMAPI', 'SiliconFlowAPI', 'LMDeployAPI', 28 | 'TaichuVLAPI', 'TaichuVLRAPI', 'DoubaoVL', "MUGUAPI" 29 | ] 30 | -------------------------------------------------------------------------------- /vlmeval/api/glm_vision.py: -------------------------------------------------------------------------------- 1 | import requests 2 | requests.packages.urllib3.disable_warnings() 3 | 4 | from vlmeval.smp import * 5 | from vlmeval.api.base import BaseAPI 6 | from vlmeval.dataset import DATASET_TYPE 7 | from vlmeval.smp.vlm import encode_image_file_to_base64 8 | 9 | 10 | class GLMVisionWrapper(BaseAPI): 11 | 12 | is_api: bool = True 13 | 14 | def __init__(self, 15 | model: str, 16 | retry: int = 5, 17 | wait: int = 5, 18 | key: str = None, 19 | verbose: bool = True, 20 | system_prompt: str = None, 21 | max_tokens: int = 4096, 22 | proxy: str = None, 23 | **kwargs): 24 | 25 | from zhipuai import ZhipuAI 26 | self.model = model 27 | self.fail_msg = 'Failed to obtain answer via API. 
' 28 | if key is None: 29 | key = os.environ.get('GLMV_API_KEY', None) 30 | assert key is not None, ( 31 | 'Please set the API Key (obtain it here: ' 32 | 'https://bigmodel.cn)' 33 | ) 34 | self.client = ZhipuAI(api_key=key) 35 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 36 | 37 | def build_msgs(self, msgs_raw, system_prompt=None, dataset=None): 38 | msgs = cp.deepcopy(msgs_raw) 39 | content = [] 40 | for i, msg in enumerate(msgs): 41 | if msg['type'] == 'text': 42 | content.append(dict(type='text', text=msg['value'])) 43 | elif msg['type'] == 'image': 44 | content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value'])))) 45 | if dataset in {'HallusionBench', 'POPE'}: 46 | content.append(dict(type="text", text="Please answer yes or no.")) 47 | ret = [dict(role='user', content=content)] 48 | return ret 49 | 50 | def generate_inner(self, inputs, **kwargs) -> str: 51 | assert isinstance(inputs, str) or isinstance(inputs, list) 52 | inputs = [inputs] if isinstance(inputs, str) else inputs 53 | 54 | messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None)) 55 | 56 | response = self.client.chat.completions.create( 57 | model=self.model, 58 | messages=messages, 59 | do_sample=False, 60 | max_tokens=2048 61 | ) 62 | try: 63 | answer = response.choices[0].message.content.strip() 64 | if self.verbose: 65 | self.logger.info(f'inputs: {inputs}\nanswer: {answer}') 66 | return 0, answer, 'Succeeded!' 67 | except Exception as err: 68 | if self.verbose: 69 | self.logger.error(f'{type(err)}: {err}') 70 | self.logger.error(f'The input messages are {inputs}.') 71 | return -1, self.fail_msg, '' 72 | 73 | 74 | class GLMVisionAPI(GLMVisionWrapper): 75 | 76 | def generate(self, message, dataset=None): 77 | return super(GLMVisionAPI, self).generate(message, dataset=dataset) 78 | -------------------------------------------------------------------------------- /vlmeval/api/qwen_api.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | import os 3 | from vlmeval.api.base import BaseAPI 4 | from vlmeval.smp import * 5 | 6 | 7 | # Note: This is a pure language model API. 8 | class QwenAPI(BaseAPI): 9 | 10 | is_api: bool = True 11 | 12 | def __init__(self, 13 | model: str = 'qwen-max-1201', 14 | retry: int = 5, 15 | wait: int = 5, 16 | verbose: bool = True, 17 | seed: int = 2680, 18 | temperature: float = 0.0, 19 | system_prompt: str = None, 20 | key: str = None, 21 | max_tokens: int = 2048, 22 | proxy: str = None, 23 | **kwargs): 24 | 25 | assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext'] 26 | self.model = model 27 | import dashscope 28 | self.fail_msg = 'Failed to obtain answer via API. 
' 29 | self.max_tokens = max_tokens 30 | self.temperature = temperature 31 | self.seed = seed 32 | if key is None: 33 | key = os.environ.get('DASHSCOPE_API_KEY', None) 34 | assert key is not None, ( 35 | 'Please set the API Key (obtain it here: ' 36 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 37 | ) 38 | dashscope.api_key = key 39 | if proxy is not None: 40 | proxy_set(proxy) 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | @staticmethod 44 | def build_msgs(msgs_raw, system_prompt=None): 45 | msgs = cp.deepcopy(msgs_raw) 46 | ret = [] 47 | if system_prompt is not None: 48 | ret.append(dict(role='system', content=system_prompt)) 49 | for i, msg in enumerate(msgs): 50 | role = 'user' if i % 2 == 0 else 'assistant' 51 | ret.append(dict(role=role, content=msg)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | inputs = [inputs] if isinstance(inputs, str) else inputs 58 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 59 | 60 | import dashscope 61 | response = dashscope.Generation.call( 62 | model=self.model, 63 | messages=messages, 64 | seed=self.seed, 65 | temperature=self.temperature, 66 | max_tokens=self.max_tokens, 67 | result_format='message', # set the result to be "message" format. 68 | ) 69 | if response.status_code != HTTPStatus.OK: 70 | return -1, 'Error: Bad Response Statuse Code. ', f'The response status code is {response.status_code}. ' 71 | 72 | try: 73 | return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! ' 74 | except Exception as err: 75 | return -1, f'Error: Failed to parse the response. 
{err}', response 76 | -------------------------------------------------------------------------------- /vlmeval/api/reka.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import mimetypes 5 | 6 | 7 | class Reka_Wrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | INTERLEAVE: bool = False 11 | 12 | def __init__(self, 13 | model: str = 'reka-flash-20240226', 14 | key: str = None, 15 | retry: int = 10, 16 | wait: int = 3, 17 | system_prompt: str = None, 18 | verbose: bool = True, 19 | temperature: float = 0, 20 | max_tokens: int = 1024, 21 | **kwargs): 22 | 23 | try: 24 | import reka 25 | except ImportError: 26 | raise ImportError('Please install reka by running "pip install reka-api"') 27 | 28 | self.model = model 29 | default_kwargs = dict(temperature=temperature, request_output_len=max_tokens) 30 | default_kwargs.update(kwargs) 31 | self.kwargs = default_kwargs 32 | if key is not None: 33 | self.key = key 34 | else: 35 | self.key = os.environ.get('REKA_API_KEY', '') 36 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 37 | 38 | def generate_inner(self, inputs, **kwargs) -> str: 39 | import reka 40 | reka.API_KEY = self.key 41 | dataset = kwargs.pop('dataset', None) 42 | prompt, image_path = self.message_to_promptimg(inputs, dataset=dataset) 43 | image_b64 = encode_image_file_to_base64(image_path) 44 | 45 | response = reka.chat( 46 | model_name=self.model, 47 | human=prompt, 48 | media_url=f'data:image/jpeg;base64,{image_b64}', 49 | **self.kwargs) 50 | 51 | try: 52 | return 0, response['text'], response 53 | except Exception as err: 54 | return -1, self.fail_msg + str(err), response 55 | 56 | 57 | class Reka(Reka_Wrapper): 58 | 59 | def generate(self, message, dataset=None): 60 | return super(Reka_Wrapper, self).generate(message) 61 | -------------------------------------------------------------------------------- /vlmeval/api/stepai.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | url = 'https://api.stepfun.com/v1/chat/completions' 5 | headers = { 6 | 'Content-Type': 'application/json', 7 | 'Authorization': 'Bearer {}', 8 | } 9 | 10 | 11 | class StepAPI_INT(BaseAPI): 12 | 13 | is_api: bool = True 14 | 15 | def __init__(self, 16 | model: str = 'step-1v-8k', 17 | retry: int = 10, 18 | wait: int = 3, 19 | key: str = None, 20 | temperature: float = 0, 21 | max_tokens: int = 300, 22 | verbose: bool = True, 23 | system_prompt: str = None, 24 | **kwargs): 25 | self.model = model 26 | self.fail_msg = 'Fail to obtain answer via API.' 
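# Store the request settings; the API key (taken from the `key` argument or the
# STEPAI_API_KEY environment variable below) is substituted into the module-level
# Authorization header before any request is sent.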
27 | self.headers = headers 28 | self.temperature = temperature 29 | self.max_tokens = max_tokens 30 | self.system_prompt = system_prompt 31 | if key is not None: 32 | self.key = key 33 | else: 34 | self.key = os.environ.get('STEPAI_API_KEY', '') 35 | headers['Authorization'] = headers['Authorization'].format(self.key) 36 | 37 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 38 | 39 | @staticmethod 40 | def build_msgs(msgs_raw): 41 | messages = [] 42 | message = {'role': 'user', 'content': []} 43 | 44 | for msg in msgs_raw: 45 | if msg['type'] == 'image': 46 | image_b64 = encode_image_file_to_base64(msg['value']) 47 | message['content'].append({ 48 | 'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)}, 49 | 'type': 'image_url' 50 | }) 51 | elif msg['type'] == 'text': 52 | message['content'].append({ 53 | 'text': msg['value'], 54 | 'type': 'text' 55 | }) 56 | 57 | messages.append(message) 58 | return messages 59 | 60 | def generate_inner(self, inputs, **kwargs) -> str: 61 | print(inputs, '\n') 62 | payload = dict( 63 | model=self.model, 64 | max_tokens=self.max_tokens, 65 | temperature=self.temperature, 66 | messages=self.build_msgs(msgs_raw=inputs), 67 | **kwargs) 68 | response = requests.post(url, headers=headers, data=json.dumps(payload)) 69 | ret_code = response.status_code 70 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 71 | 72 | answer = self.fail_msg 73 | try: 74 | resp_struct = json.loads(response.text) 75 | answer = resp_struct['choices'][0]['message']['content'].strip() 76 | except Exception as err: 77 | if self.verbose: 78 | self.logger.error(f'{type(err)}: {err}') 79 | self.logger.error(response.text if hasattr(response, 'text') else response) 80 | 81 | return ret_code, answer, response 82 | 83 | 84 | class Step1V_INT(StepAPI_INT): 85 | 86 | def generate(self, message, dataset=None): 87 | return super(StepAPI_INT, self).generate(message) 88 | -------------------------------------------------------------------------------- /vlmeval/dataset/GUI/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/dataset/GUI/__init__.py -------------------------------------------------------------------------------- /vlmeval/dataset/Omnidocbench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/dataset/Omnidocbench/__init__.py -------------------------------------------------------------------------------- /vlmeval/dataset/Omnidocbench/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.26.0 2 | apted 3 | BeautifulSoup4 4 | evaluate 5 | func_timeout 6 | jmespath 7 | Levenshtein 8 | lxml 9 | nltk 10 | pylatexenc 11 | qwen_vl_utils 12 | scipy 13 | torchvision 14 | -------------------------------------------------------------------------------- /vlmeval/dataset/emma.py: -------------------------------------------------------------------------------- 1 | from vlmeval import * 2 | from .image_shortqa import ImageShortQADataset 3 | from .image_mcq import MMMUDataset 4 | 5 | 6 | class EMMADataset(ImageShortQADataset): 7 | 8 | COT_INST = "Please solve the problem step by step. 
" 9 | DIRECT_INST = "Please ensure that your output only contains the final answer without any additional content (such as intermediate reasoning steps)." # noqa: E501 10 | MCQ_FMT = "{context}\n\n{question}\n\n{options}\n\nAnswer with the option's letter from the given choices. " 11 | OPEN_FMT = "{context}\n\n{question}\n\nAnswer the question using a single word or phrase. " 12 | 13 | DATASET_URL = { 14 | 'EMMA': 'https://opencompass.openxlab.space/utils/VLMEval/EMMA.tsv', 15 | 'EMMA_COT': 'https://opencompass.openxlab.space/utils/VLMEval/EMMA.tsv' 16 | } 17 | 18 | def build_prompt(self, line): 19 | if isinstance(line, int): 20 | line = self.data.iloc[line] 21 | 22 | if self.meta_only: 23 | tgt_path = toliststr(line['image_path']) 24 | else: 25 | tgt_path = self.dump_image(line) 26 | 27 | context = line['context'] 28 | question = line['question'] 29 | example = "" 30 | _ = {} 31 | if line['type'] == 'MCQ': 32 | for ch in string.ascii_uppercase: 33 | if ch in line and not pd.isna(line[ch]): 34 | example += f"{ch}: {line[ch]}\n" 35 | 36 | prompt_tmpl = EMMADataset.MCQ_FMT 37 | if not pd.isna(context) and context is not None: 38 | prompt = prompt_tmpl.format(context=context, question=question, options=example) 39 | else: 40 | prompt = prompt_tmpl.split('{context}\n\n')[1].format(question=question, options=example) 41 | prompt += EMMADataset.COT_INST if 'COT' in self.dataset_name else EMMADataset.DIRECT_INST 42 | else: 43 | prompt_tmpl = EMMADataset.OPEN_FMT 44 | if not pd.isna(context) and context is not None: 45 | prompt = prompt_tmpl.format(context=context, question=question) 46 | else: 47 | prompt = prompt_tmpl.split('{context}\n\n')[1].format(question=question) 48 | prompt += EMMADataset.COT_INST if 'COT' in self.dataset_name else EMMADataset.DIRECT_INST 49 | 50 | msgs = [] 51 | if isinstance(tgt_path, list): 52 | msgs.extend([dict(type='image', value=p) for p in tgt_path]) 53 | else: 54 | msgs = [dict(type='image', value=tgt_path)] 55 | msgs.append(dict(type='text', value=prompt)) 56 | return MMMUDataset.split_MMMU(msgs) 57 | -------------------------------------------------------------------------------- /vlmeval/dataset/image_caption.py: -------------------------------------------------------------------------------- 1 | from .image_base import ImageBaseDataset 2 | from ..smp import * 3 | 4 | 5 | class COCO_Caption_Scorer(): 6 | def __init__(self, ref, gt): 7 | from pycocoevalcap.bleu.bleu import Bleu 8 | from pycocoevalcap.rouge.rouge import Rouge 9 | from pycocoevalcap.cider.cider import Cider 10 | 11 | self.ref = ref 12 | self.gt = gt 13 | print('setting up scorers...') 14 | self.scorers = [ 15 | (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), 16 | (Rouge(), 'ROUGE_L'), 17 | (Cider(), 'CIDEr'), 18 | ] 19 | 20 | def compute_scores(self): 21 | total_scores = {} 22 | for scorer, method in self.scorers: 23 | print('computing %s score...' 
% (scorer.method())) 24 | score, scores = scorer.compute_score(self.gt, self.ref) 25 | if isinstance(method, list): 26 | for sc, scs, m in zip(score, scores, method): 27 | print('%s: %0.3f' % (m, sc * 100)) 28 | total_scores['Bleu'] = [x * 100 for x in score] 29 | else: 30 | print('%s: %0.3f' % (method, score * 100)) 31 | total_scores[method] = score * 100 32 | 33 | print('*****DONE*****') 34 | for key, value in total_scores.items(): 35 | print('{}:{}'.format(key, value)) 36 | return total_scores 37 | 38 | 39 | class ImageCaptionDataset(ImageBaseDataset): 40 | 41 | TYPE = 'Caption' 42 | 43 | DATASET_URL = { 44 | 'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv', 45 | } 46 | 47 | DATASET_MD5 = { 48 | 'COCO_VAL': '72a5079dead060269ac222c5aa5128af', 49 | } 50 | 51 | def load_data(self, dataset): 52 | data = super().load_data(dataset) 53 | if 'question' not in data: 54 | data['question'] = [( 55 | 'Please describe this image in general. Directly provide the description, ' 56 | 'do not include prefix like "This image depicts". ' 57 | )] * len(data) 58 | return data 59 | 60 | # It returns a dictionary of scores 61 | @classmethod 62 | def evaluate(self, eval_file, **kwargs): 63 | data = load(eval_file) 64 | lt = len(data) 65 | lines = [data.iloc[i] for i in range(lt)] 66 | ref, gt = {}, {} 67 | for i, line in enumerate(lines): 68 | ref[str(i)] = [str(line['prediction'])] 69 | gt[str(i)] = eval(line['answer']) 70 | 71 | scorer = COCO_Caption_Scorer(ref, gt) 72 | coco_caption_score_dict = scorer.compute_scores() 73 | score_pth = eval_file.replace('.xlsx', '_score.json') 74 | dump(coco_caption_score_dict, score_pth) 75 | return coco_caption_score_dict 76 | -------------------------------------------------------------------------------- /vlmeval/dataset/mmgenbench.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import pandas as pd 3 | from abc import abstractmethod 4 | from ..smp import * 5 | from .image_base import ImageBaseDataset 6 | 7 | 8 | class MMGenBench(ImageBaseDataset): 9 | 10 | prompt_list = [ 11 | """ 12 | # Role 13 | You are an expert in the field of image understanding, focusing on the \ 14 | understanding of images and generating the image caption-prompt. 15 | 16 | # Definition Explanation 17 | image caption-prompt: Refers to the caption or description of an image, \ 18 | used to provide to a Text-to-Image model to generate a new image. 19 | Text-to-Image model: Can generate a new image based on the provided image \ 20 | caption-prompt, such as stable diffusion 3, flux, and other image generation models. 21 | 22 | # Task Description 23 | Generate an image caption-prompt based on the input image. 24 | 25 | # Key Points and Requirements 26 | 1. Accurately understand the input image and precisely generate an image caption-prompt. 27 | 2. The generated image caption-prompt, when provided to the Text-to-Image model, requires the \ 28 | Text-to-Image model to generate a new image that is as consistent as possible with the input image. 29 | 3. The generated image caption-prompt must conform to the preferences of the Text-to-Image model. 30 | 4. The generated image caption-prompt should describe the input image in as much \ 31 | detail as possible, and it should be between 20 to 60 words. 32 | 33 | # Output Format 34 | A string, that is the image caption-prompt. No extra output needed. 
35 | """ 36 | ] 37 | TYPE = 'GenerateImgPrompt' 38 | DATASET_URL = { 39 | 'MMGenBench-Test': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Test.tsv', 40 | 'MMGenBench-Domain': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Domain.tsv', 41 | } 42 | PROMPT_MAP = { 43 | 'MMGenBench-Test': prompt_list[0], 44 | 'MMGenBench-Domain': prompt_list[0], 45 | } 46 | DATASET_MD5 = { 47 | 'MMGenBench-Test': "94f8dac6bbf7c20be403f99adeaa73da", 48 | 'MMGenBench-Domain': "5c10daf6e2c5f08bdfb0701aa6db86bb", 49 | } 50 | 51 | def __init__(self, dataset='MMGenBench', **kwargs): 52 | super().__init__(dataset, **kwargs) 53 | warnings.warn('This dataset is for inference only and does not support direct output of evaluation results.\n') 54 | warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n') 55 | 56 | def load_data(self, dataset): 57 | data = super().load_data(dataset) 58 | if 'question' not in data: 59 | data['question'] = [( 60 | self.PROMPT_MAP[dataset] 61 | )] * len(data) 62 | return data 63 | 64 | # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe 65 | @abstractmethod 66 | def evaluate(self, eval_file, **judge_kwargs): 67 | warnings.warn('This evaluation method is not supported.\n') 68 | warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n') 69 | return None 70 | -------------------------------------------------------------------------------- /vlmeval/dataset/text_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from ..smp import * 3 | 4 | 5 | class TextBaseDataset: 6 | MODALITY = 'TEXT' 7 | DATASET_URL = {} 8 | DATASET_MD5 = {} 9 | 10 | def __init__(self, dataset='MMBench', **kwargs): 11 | self.dataset_name = dataset 12 | 13 | data = self.load_data(dataset) 14 | 15 | data['index'] = [str(x) for x in data['index']] 16 | 17 | if np.all([istype(x, int) for x in data['index']]): 18 | data['index'] = [int(x) for x in data['index']] 19 | 20 | self.data = data 21 | self.post_build(dataset) 22 | 23 | def __len__(self): 24 | return len(self.data) 25 | 26 | def __getitem__(self, idx): 27 | return dict(self.data.iloc[idx]) 28 | 29 | def prepare_tsv(self, url, file_md5=None): 30 | data_root = LMUDataRoot() 31 | os.makedirs(data_root, exist_ok=True) 32 | update_flag = False 33 | file_name = url.split('/')[-1] 34 | data_path = osp.join(data_root, file_name) 35 | if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5): 36 | pass 37 | else: 38 | warnings.warn('The dataset tsv is not downloaded') 39 | download_file(url, data_path) 40 | update_flag = True 41 | 42 | if file_size(data_path, 'GB') > 1: 43 | local_path = data_path.replace('.tsv', '_local.tsv') 44 | if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag: 45 | from ..tools import LOCALIZE 46 | LOCALIZE(data_path, local_path) 47 | data_path = local_path 48 | return load(data_path) 49 | 50 | def dump_image(self, line): 51 | return [] 52 | 53 | def display(self, line): 54 | if isinstance(line, int): 55 | line = self.data.iloc[line] 56 | assert isinstance(line, pd.Series) or isinstance(line, dict) 57 | mmqa_display(line) 58 | 59 | # Return a list of dataset names that are supported by this class, can override 60 | @classmethod 61 | def supported_datasets(cls): 62 | return list(cls.DATASET_URL) 63 | 64 | # Given the 
dataset name, return the dataset as a pandas dataframe, can override 65 | def load_data(self, dataset): 66 | url = self.DATASET_URL[dataset] 67 | file_md5 = self.DATASET_MD5[dataset] 68 | return self.prepare_tsv(url, file_md5) 69 | 70 | # Post built hook, will be called after the dataset is built, can override 71 | def post_build(self, dataset): 72 | pass 73 | 74 | # Given one data record, return the built prompt (a multi-modal message), can override 75 | def build_prompt(self, line): 76 | if isinstance(line, int): 77 | line = self.data.iloc[line] 78 | 79 | question = line['question'] 80 | 81 | msgs = [] 82 | msgs.append(dict(type='text', value=question)) 83 | return msgs 84 | 85 | # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe 86 | @abstractmethod 87 | def evaluate(self, eval_file, **judge_kwargs): 88 | pass 89 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .judge_util import build_judge, DEBUG_MESSAGE 2 | from .multiple_choice import extract_answer_from_item, prefetch_answer 3 | from .vqa_eval import levenshtein_distance 4 | from .spatial457 import Spatial457_utils 5 | 6 | 7 | __all__ = [ 8 | 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 9 | 'levenshtein_distance', 'DEBUG_MESSAGE', 10 | 'Spatial457_utils' 11 | ] 12 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/ccocr_evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | from .kie_evaluator import KieEvaluator 2 | from .doc_parsing_evaluator import ParsingEvaluator 3 | from .ocr_evaluator import OcrEvaluator 4 | from .common import summary 5 | 6 | 7 | evaluator_map_info = { 8 | "kie": KieEvaluator("kie"), 9 | "doc_parsing": ParsingEvaluator("doc_parsing"), 10 | "multi_lan_ocr": OcrEvaluator("multi_lan_ocr"), 11 | "multi_scene_ocr": OcrEvaluator("multi_scene_ocr") 12 | } 13 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/crpe.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from collections import defaultdict 4 | 5 | 6 | def is_correct(predict, answer): 7 | # predict是标准答案 answer是预测 8 | if len(answer) == 1: 9 | return answer[0] == predict[0] 10 | elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']: 11 | return answer[0] == predict[0] 12 | elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']: 13 | return predict[4:].lower() in answer.lower() 14 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/hrbench.py: -------------------------------------------------------------------------------- 1 | from ...smp import * 2 | import os 3 | 4 | 5 | def report_acc_hrbench(df): 6 | cycle_group = df.groupby('cycle_category') 7 | result_dic = defaultdict(list) 8 | avg_dic = defaultdict(int) 9 | 10 | count = 0 11 | for key, data_value in cycle_group: 12 | count += 1 13 | _, resp_dic = hrbench_score(data_value) 14 | 15 | for task_type, accuracy in resp_dic.items(): 16 | result_dic['cycle'].append(key) 17 | result_dic['type'].append(task_type) 18 | result_dic['accuracy'].append(accuracy) 19 | 20 | avg_dic[task_type] += accuracy 21 | for task_type, accuracy in avg_dic.items(): 22 | 
result_dic['cycle'].append('Average') 23 | result_dic['type'].append(task_type) 24 | result_dic['accuracy'].append(accuracy / count) 25 | result_pd = pd.DataFrame(result_dic) 26 | 27 | return result_pd 28 | 29 | 30 | def hrbench_score(data): 31 | ret = defaultdict(list) 32 | resp_dic = {} 33 | category_list = set(data['category']) 34 | score_dict = defaultdict(list) 35 | 36 | for i in range(len(data)): 37 | d = data.iloc[i] 38 | category = d['category'] 39 | gpt_score = d['hit'] 40 | score_dict[category].append(gpt_score) 41 | score_dict['all'].append(gpt_score) 42 | 43 | all_acc = np.mean(score_dict['all']) 44 | ret['type'].append('all') 45 | ret['acc'].append(all_acc) 46 | resp_dic['all'] = all_acc 47 | for cate in category_list: 48 | acc = np.mean(score_dict[cate]) 49 | ret['type'].append(cate) 50 | ret['acc'].append(acc) 51 | 52 | resp_dic[cate] = acc 53 | 54 | return pd.DataFrame(ret), resp_dic 55 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/judge_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ...smp import load_env 3 | 4 | INTERNAL = os.environ.get('INTERNAL', 0) 5 | 6 | 7 | def build_judge(**kwargs): 8 | from ...api import OpenAIWrapper, SiliconFlowAPI, HFChatModel 9 | model = kwargs.pop('model', None) 10 | kwargs.pop('nproc', None) 11 | load_env() 12 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None) 13 | if LOCAL_LLM is None: 14 | model_map = { 15 | 'gpt-4-turbo': 'gpt-4-1106-preview', 16 | 'gpt-4-0613': 'gpt-4-0613', 17 | 'gpt-4-0125': 'gpt-4-0125-preview', 18 | 'gpt-4-0409': 'gpt-4-turbo-2024-04-09', 19 | 'chatgpt-1106': 'gpt-3.5-turbo-1106', 20 | 'chatgpt-0125': 'gpt-3.5-turbo-0125', 21 | 'gpt-4o': 'gpt-4o-2024-05-13', 22 | 'gpt-4o-0806': 'gpt-4o-2024-08-06', 23 | 'gpt-4o-mini': 'gpt-4o-mini-2024-07-18', 24 | 'qwen-7b': 'Qwen/Qwen2.5-7B-Instruct', 25 | 'qwen-72b': 'Qwen/Qwen2.5-72B-Instruct', 26 | 'deepseek': 'deepseek-ai/DeepSeek-V3', 27 | 'llama31-8b': 'meta-llama/Llama-3.1-8B-Instruct', 28 | } 29 | model_version = model_map[model] 30 | else: 31 | model_version = LOCAL_LLM 32 | 33 | if model in ['qwen-7b', 'qwen-72b', 'deepseek']: 34 | model = SiliconFlowAPI(model_version, **kwargs) 35 | elif model == 'llama31-8b': 36 | model = HFChatModel(model_version, **kwargs) 37 | else: 38 | model = OpenAIWrapper(model_version, **kwargs) 39 | return model 40 | 41 | 42 | DEBUG_MESSAGE = """ 43 | To debug the OpenAI API, you can try the following script in Python: 44 | ```python 45 | from vlmeval.api import OpenAIWrapper 46 | model = OpenAIWrapper('gpt-4o', verbose=True) 47 | msgs = [dict(type='text', value='Hello!')] 48 | code, answer, resp = model.generate_inner(msgs) 49 | print(code, answer, resp) 50 | ``` 51 | You can see the specific error if the API call fails. 52 | """ 53 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/longvideobench.py: -------------------------------------------------------------------------------- 1 | from ...smp import * 2 | from .multiple_choice import extract_answer_from_item 3 | import numpy as np 4 | import re 5 | 6 | FAIL_MSG = 'Failed to obtain answer via API.'
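# A minimal usage sketch, assuming a score file whose rows carry the 'question_category',
# 'duration_group', and 'score' columns that get_dimension_rating() reads below; the file
# name is a placeholder:
#
#     rating = get_dimension_rating('LongVideoBench_model_score.xlsx')
#     print(rating['overall']['overall'])             # overall accuracy, formatted as a string
#     print(rating[600]['question_category']['S2E'])  # one duration-group / task-category bucket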
7 | 8 | DURATIONS = [15, 60, 600, 3600] 9 | TASK_CATEGORIES = [ 10 | "S2E", "S2O", "S2A", 11 | "E2O", "O2E", "T2E", 12 | "T2O", "T2A", "E3E", 13 | "O3O", "SSS", "SOS", 14 | "SAA", "T3E", "T3O", 15 | "TOS", "TAA" 16 | ] 17 | 18 | 19 | def get_dimension_rating(data_path): 20 | data = load(data_path) 21 | print(data.iloc[0]) 22 | 23 | duration_rating = {k: {} for k in DURATIONS} 24 | for duration in DURATIONS + ['overall']: 25 | duration_rating[duration] = { 26 | 'overall': '', 27 | 'question_category': {k: [] for k in TASK_CATEGORIES} 28 | } 29 | 30 | for i in range(len(data)): 31 | 32 | task_ctg = data.iloc[i]['question_category'] 33 | 34 | duration = data.iloc[i]['duration_group'] 35 | duration_rating[duration]['question_category'][task_ctg].append(data.iloc[i]['score']) 36 | 37 | duration_rating['overall']['question_category'][task_ctg].append(data.iloc[i]['score']) 38 | 39 | for duration in DURATIONS + ['overall']: 40 | overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["question_category"].values(), []) if x >= 0]):.3f}' # noqa: E501 41 | duration_rating[duration]['overall'] = overall_res_dur 42 | for task_ctg in TASK_CATEGORIES: 43 | task_res_dur = f'{np.mean([x for x in duration_rating[duration]["question_category"][task_ctg] if x >= 0]):.3f}' # noqa: E501 44 | duration_rating[duration]['question_category'][task_ctg] = task_res_dur 45 | 46 | return duration_rating 47 | 48 | 49 | def extract_option(model, input_item, dataset_name): 50 | options = input_item['question'].split('\n')[1:] 51 | for id, option in enumerate(options): 52 | option_id = chr(ord('A') + id) + '.' 53 | if option.find(option_id) >= 0: 54 | input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. \n') 55 | return extract_answer_from_item(model, input_item, dataset_name)['opt'] 56 | 57 | 58 | def extract_characters_regex(s): 59 | s = s.strip() 60 | answer_prefixes = [ 61 | 'The best answer is', 62 | 'The correct answer is', 63 | 'The answer is', 64 | 'The answer', 65 | 'The best option is' 66 | 'The correct option is', 67 | 'Best answer:' 68 | 'Best option:', 69 | 'Answer:', 70 | 'Option:', 71 | ] 72 | for answer_prefix in answer_prefixes: 73 | s = s.replace(answer_prefix, '') 74 | 75 | if len(s.split()) > 10 and not re.search('[ABCDE]', s): 76 | return '' 77 | matches = re.search(r'[ABCDE]', s) 78 | if matches is None: 79 | return '' 80 | return matches[0] 81 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/__init__.py: -------------------------------------------------------------------------------- 1 | from .aggregation_type import AggregationType 2 | from .metric_type import MetricType 3 | from .response_parse_type import ResponseParseType 4 | 5 | __all__ = [AggregationType, MetricType, ResponseParseType] 6 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/aggregation/mean_agg.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import Dict 3 | import numpy as np 4 | 5 | 6 | class MeanAggregation: 7 | """Take the mean of all valid scores.""" 8 | 9 | @staticmethod 10 | def aggregate(scores: Dict[str, Number], weights: Dict[str, Number]) -> Number: 11 | """Exact match between targets and responses.""" 12 | filtered_scores = {f: s for f, s in scores.items() if s >= 0} 13 | if not filtered_scores: 14 | return -1 15 | 16 | # Align the key order 17 | 
flattened_scores = [] 18 | flattened_weights = [] 19 | for field in filtered_scores: 20 | flattened_scores.append(filtered_scores[field]) 21 | flattened_weights.append(weights[field]) 22 | return np.average(flattened_scores, weights=flattened_weights) 23 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/aggregation/min_agg.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import Dict 3 | 4 | 5 | class MinAggregation: 6 | """Take the minimum of all valid scores.""" 7 | 8 | @staticmethod 9 | def aggregate(scores: Dict[str, Number], weights: Dict[str, Number]) -> Number: 10 | """Exact match between targets and responses.""" 11 | filtered_scores = [s for s in scores.values() if s >= 0] 12 | if not filtered_scores: 13 | return -1 14 | return min(filtered_scores) 15 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/aggregation/unsupported_agg.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import Dict 3 | 4 | 5 | class UnsupportedAggregation: 6 | @staticmethod 7 | def aggregate(scores: Dict[str, Number], weights: Dict[str, Number]) -> Number: 8 | return -1 9 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/aggregation_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class AggregationType(Enum): 4 | MEAN = 0 5 | 6 | @classmethod 7 | def from_string(cls, s): 8 | return cls.MEAN 9 | 10 | def aggregate(self, field_scores, field_weights): 11 | if not field_scores: 12 | return 0.0 13 | 14 | total_score = 0.0 15 | total_weight = 0.0 16 | 17 | for field, score in field_scores.items(): 18 | weight = field_weights.get(field, 1.0) 19 | try: 20 | total_score += score * weight 21 | except: 22 | total_score += score[0] * weight 23 | total_weight += weight 24 | 25 | return total_score / total_weight if total_weight > 0 else 0.0 26 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/parsing/dummy_parse.py: -------------------------------------------------------------------------------- 1 | class DummyParse: 2 | 3 | @staticmethod 4 | def parse(response: str, *args, **kwargs) -> dict: 5 | """return the raw string without doing anything""" 6 | return response.strip() 7 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/parsing/json_parse.py: -------------------------------------------------------------------------------- 1 | from .common.parsers import parse_json 2 | from .common.utils import evaluate_as_string 3 | 4 | 5 | class JsonParse: 6 | """Load the response as a JSON object.""" 7 | 8 | @staticmethod 9 | def parse(response: str): 10 | """Parse the JSON object, including nested JSON strings.""" 11 | parsed_res = parse_json(response) 12 | # Drop the potentially duplicated string quotes 13 | if isinstance(parsed_res, dict): 14 | for key, val in parsed_res.items(): 15 | parsed_res[key] = evaluate_as_string(val) 16 | 17 | return parsed_res 18 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/requirements.txt: -------------------------------------------------------------------------------- 1 
| antlr4-python3-runtime==4.11.0 2 | filelock==3.16.1 3 | geopy==2.4.1 4 | jieba==0.42.1 5 | nltk==3.9.1 6 | numpy==1.26.4 7 | pronouncing==0.2.0 8 | rapidfuzz==3.9.5 9 | regex==2024.7.24 10 | requests==2.32.3 11 | requests_cache==1.2.1 12 | sacrebleu==2.4.3 13 | sympy==1.13.2 14 | tqdm==4.66.4 15 | Unidecode==1.3.8 16 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/response_parse_type.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | from enum import Enum 3 | from .parsing.json_parse import JsonParse 4 | from .parsing.answer_str_parse import ( 5 | AnswerStrParse, 6 | AsciiAnswerStrParse, 7 | VerbatimAnswerStrParse, 8 | ) 9 | from vlmeval.dataset.utils.megabench.parsing.dummy_parse import DummyParse 10 | 11 | 12 | class ResponseParseType(Enum): 13 | """Parse the response.""" 14 | 15 | JSON = "json" 16 | ANSWER_STR = "answer_string" 17 | ASCII_ANSWER_STR = "ascii_answer_string" 18 | VERBATIM_ANSWER_STR = "verbatim_answer_string" 19 | DUMMY = "dummy" 20 | UNSUPPORTED = "unsupported" 21 | 22 | @cached_property 23 | def class_impl(self): 24 | if self == ResponseParseType.ANSWER_STR: 25 | return AnswerStrParse 26 | elif self == ResponseParseType.ASCII_ANSWER_STR: 27 | return AsciiAnswerStrParse 28 | elif self == ResponseParseType.VERBATIM_ANSWER_STR: 29 | return VerbatimAnswerStrParse 30 | elif self == ResponseParseType.DUMMY: 31 | return DummyParse 32 | else: 33 | return JsonParse 34 | 35 | def is_single_field_parser(self): 36 | return self in [ 37 | ResponseParseType.ANSWER_STR, 38 | ResponseParseType.ASCII_ANSWER_STR, 39 | ResponseParseType.VERBATIM_ANSWER_STR, 40 | ] 41 | 42 | def parse(self, response: str, *args, **kwargs): 43 | """Parse the response.""" 44 | return self.class_impl.parse(response, *args, **kwargs) 45 | 46 | @staticmethod 47 | def from_string(s): 48 | """Initialize the response parsing type from a string.""" 49 | try: 50 | if s is None: 51 | return ResponseParseType("unsupported") 52 | return ResponseParseType(s.lower()) 53 | except KeyError as exc: 54 | raise ValueError(f"Invalid metric type: {s}") from exc 55 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/chess_jaccard.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, Any 3 | from .common.conversions import str_to_set 4 | from .common.metrics import jaccard_index 5 | 6 | 7 | def chess_transform(move_sequence: str) -> set: 8 | """Transform a sequence of chess moves encoded in SAN into a set.""" 9 | move_sequence = str_to_set(move_sequence) 10 | return {move_san.removesuffix("!").removesuffix("#") for move_san in move_sequence} 11 | 12 | 13 | class ChessMoveJaccard: 14 | """Calculates the Jacard index for chess moves.""" 15 | 16 | @classmethod 17 | def match(cls, responses: str | None, targets: str) -> float: 18 | """Exact match between targets and responses.""" 19 | if responses is None: 20 | return 0 21 | responses = chess_transform(responses) 22 | targets = chess_transform(targets) 23 | 24 | return jaccard_index(responses, targets) 25 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/coordinate_sequence_match.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .common.conversions 
import str_to_coords 3 | import numpy as np 4 | 5 | 6 | class CoordsSequenceSimilarity: 7 | """ 8 | Measure the similarity between two list of coordinates, used for keypoint estimation tasks 9 | """ 10 | 11 | @staticmethod 12 | def compute_score(pred_keypoints, gt_keypoints, k=10): 13 | """ 14 | Compute the evaluation score for keypoint estimation. 15 | 16 | Args: 17 | pred_keypoints (list or np.ndarray): List or array of predicted keypoint coordinates, 18 | each as (x, y), normalized to [0, 1]. 19 | gt_keypoints (list or np.ndarray): List or array of ground truth keypoint coordinates, 20 | each as (x, y), normalized to [0, 1]. 21 | 22 | Returns: 23 | float: A score between 0 and 1, where 1 indicates perfect accuracy, 24 | and 0 indicates completely wrong. 25 | """ 26 | # Convert inputs to NumPy arrays 27 | try: 28 | pred_keypoints = np.array(pred_keypoints) 29 | except ValueError: 30 | # Format is not a correct 31 | return 0 32 | 33 | gt_keypoints = np.array(gt_keypoints) 34 | 35 | # shape mismatch, directly assign 0 score 36 | if pred_keypoints.shape != gt_keypoints.shape: 37 | return 0 38 | 39 | # Compute Euclidean distances between corresponding keypoints 40 | distances = np.linalg.norm(pred_keypoints - gt_keypoints, axis=1) 41 | 42 | # Maximum possible distance in normalized coordinate space 43 | max_distance = np.sqrt(2) 44 | 45 | # Normalize distances 46 | normalized_distances = distances / max_distance 47 | 48 | # Compute per-keypoint scores using exponential decay 49 | per_keypoint_scores = np.exp(-k * normalized_distances) 50 | 51 | # Compute the average score across all keypoints 52 | score = np.mean(per_keypoint_scores) 53 | 54 | return score 55 | 56 | @classmethod 57 | def match(cls, responses, targets) -> float: 58 | """Exact match between targets and responses.""" 59 | logging.debug(f"{responses=}, {targets=}") 60 | if not isinstance(responses, (tuple | list)): 61 | responses = str_to_coords(responses, dim=2) 62 | if not isinstance(targets, (tuple | list)): 63 | targets = str_to_coords(targets, dim=2) 64 | 65 | return cls.compute_score(responses, targets) 66 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/dict_equality.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_dict 2 | from .simple_str_match import ExactStrMatch 3 | 4 | 5 | class DictEquality: 6 | """Calculates the exact string match across the dict. 7 | 8 | 1. Calculates the exact match for all keys in the solution 9 | 2. 
Calculates the total, then divides by the size of the solution 10 | """ 11 | 12 | @classmethod 13 | def match(cls, responses, targets) -> float: 14 | """Return the aggregated Jaccard index between targets and responses.""" 15 | responses = cast_to_dict(responses) 16 | targets = cast_to_dict(targets) 17 | 18 | if not isinstance(responses, dict): 19 | return 0 20 | 21 | return 1 if responses == targets else 0 22 | 23 | 24 | class DictPrecision: 25 | 26 | @classmethod 27 | def match(cls, responses, targets) -> float: 28 | """Return the aggregated Jaccard index between targets and responses.""" 29 | responses = cast_to_dict(responses) 30 | targets = cast_to_dict(targets) 31 | 32 | if not isinstance(responses, dict): 33 | return 0 34 | 35 | if len(responses) == 0: 36 | return 0 37 | 38 | matched = 0 39 | for key, val in responses.items(): 40 | if key in targets: 41 | if ExactStrMatch.match(val, targets[key]): 42 | matched += 1 43 | 44 | return matched / len(responses) 45 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/dict_exact_match_agg_recall.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_dict 2 | from .exact_str_match import ExactStrMatch 3 | 4 | 5 | class DictExactStrMatchAggRecall: 6 | """Calculates the exact string match across the dict. 7 | 8 | 1. Calculates the exact match for all keys in the solution 9 | 2. Calculates the total, then divides by the size of the solution 10 | """ 11 | 12 | @classmethod 13 | def match(cls, responses, targets) -> float: 14 | """Return the aggregated Jaccard index between targets and responses.""" 15 | responses = cast_to_dict(responses) 16 | targets = cast_to_dict(targets) 17 | 18 | if not isinstance(responses, dict): 19 | return 0 20 | 21 | num_keys = 0 22 | total_score = 0 23 | for key, answer in targets.items(): 24 | total_score += ExactStrMatch.match(responses.get(key), answer) 25 | num_keys += 1 26 | 27 | return total_score / num_keys 28 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/dict_jaccard_agg_jaccard.py: -------------------------------------------------------------------------------- 1 | from .jaccard import Jaccard 2 | from .common.conversions import cast_to_dict 3 | 4 | 5 | class DictJaccardAggJaccard: 6 | """Calculates the Jaccard index, dividing by the union of the predictions. 7 | 8 | 1. Calculates the Jaccard index for all sets with the same key, 9 | if it appears in either pred or targets 10 | 2. 
Calculates the total, then divides by the size of the union 11 | """ 12 | 13 | @classmethod 14 | def match(cls, responses, targets) -> float: 15 | """Return the aggregated Jaccard index between targets and responses.""" 16 | responses = cast_to_dict(responses) 17 | if not isinstance(responses, dict): 18 | return 0 19 | 20 | all_keys = set(responses) | set(targets) 21 | 22 | num_keys = 0 23 | total_score = 0 24 | for key in all_keys: 25 | total_score += Jaccard.match(responses.get(key, []), targets.get(key, [])) 26 | num_keys += 1 27 | 28 | return total_score / num_keys 29 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/dict_nbbox_iou_tuple_agg_jaccard.py: -------------------------------------------------------------------------------- 1 | from .nbbox_iou import NbboxIouTuple 2 | 3 | 4 | class DictNbboxIouTupleAggJaccard: 5 | """Calculates the average precision IoU across the dict. 6 | 7 | 1. Calculates the precision IoU for all sets with the same key, 8 | if it appears in either pred or targets 9 | 2. Calculates the total, then divides by the size of the union 10 | """ 11 | 12 | @classmethod 13 | def match(cls, responses, targets) -> float: 14 | """Return the aggregated Jaccard index between targets and responses.""" 15 | if not isinstance(responses, dict): 16 | return 0 17 | all_keys = set(responses) | set(targets) 18 | 19 | num_keys = 0 20 | total_score = 0 21 | for key in all_keys: 22 | total_score += NbboxIouTuple.match( 23 | responses.get(key, []), targets.get(key, []) 24 | ) 25 | num_keys += 1 26 | 27 | return total_score / num_keys 28 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/dict_set_equality_agg_jaccard.py: -------------------------------------------------------------------------------- 1 | from vlmeval.dataset.utils.megabench.scoring.set_equality import SetEquality 2 | 3 | 4 | class DictSetEqualityAggJaccard: 5 | """Calculates the average set equality across the dict. 6 | 7 | 1. Calculates the set equality for all sets with the same key, 8 | if it appears in either pred or targets 9 | 2. Calculates the total, then divides by the size of the union 10 | """ 11 | 12 | @classmethod 13 | def match(cls, responses, targets) -> float: 14 | """Return the aggregated Jaccard index between targets and responses.""" 15 | if not isinstance(responses, dict): 16 | return 0 17 | 18 | all_keys = set(responses) | set(targets) 19 | 20 | num_keys = 0 21 | total_score = 0 22 | for key in all_keys: 23 | total_score += SetEquality.match( 24 | responses.get(key, []), targets.get(key, []) 25 | ) 26 | num_keys += 1 27 | 28 | return total_score / num_keys 29 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/exact_str_match.py: -------------------------------------------------------------------------------- 1 | import re 2 | from ..parsing.common.utils import extract_code_block_content 3 | 4 | 5 | def parse_single_letter(s): 6 | # Regular expression to match (A)XXXXX, A . 
XXXXXXX, or A.XXXXXX 7 | match = re.match(r"^\(?([A-Za-z])\)?(?:\s*\.\s*|\.)?(.*)", s) 8 | 9 | if match: 10 | # Extract and return the single letter 11 | return match.group(1) 12 | else: 13 | # Return the original string if no match is found 14 | return s 15 | 16 | 17 | class ExactStrMatch: 18 | """Exact string matching.""" 19 | 20 | @staticmethod 21 | def match(response: str, correct_answer: str) -> int: 22 | """Exact match between targets and responses.""" 23 | if not isinstance(response, str): 24 | response = str(response) 25 | if not isinstance(correct_answer, str): 26 | correct_answer = str(correct_answer) 27 | 28 | if len(correct_answer) == 1 and correct_answer.isalpha() and len(response) > 1: 29 | # handle special case of choice letter, 30 | # drop the potential parenthesis 31 | response = parse_single_letter(response) 32 | 33 | return 1 if response == correct_answer else 0 34 | 35 | 36 | class CodeResultExactStrMatch: 37 | """Exact string matching, with the results from a results code block.""" 38 | 39 | @staticmethod 40 | def match(response: str, correct_answer: str) -> int: 41 | """Exact match between targets and responses.""" 42 | correct_answer, is_code = extract_code_block_content( 43 | correct_answer, 44 | is_ascii_art=True, 45 | should_remove_surrounding_whitespace=False, 46 | ) 47 | # assert is_code 48 | return ExactStrMatch.match(response, correct_answer) 49 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/exact_str_match_case_insensitive.py: -------------------------------------------------------------------------------- 1 | from .exact_str_match import ExactStrMatch 2 | 3 | 4 | class ExactStrMatchCaseInsensitive: 5 | """Case-insensitive exact string matching.""" 6 | 7 | @staticmethod 8 | def match(response, correct_answer) -> int: 9 | """Case-insensitive exact match between targets and responses.""" 10 | if not isinstance(response, str) and isinstance(correct_answer, str): 11 | return 0 12 | return ExactStrMatch.match(response.lower(), correct_answer.lower()) 13 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/gleu.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | import jieba 3 | from nltk.translate.gleu_score import sentence_gleu 4 | 5 | 6 | class GLEUChinese: 7 | """Compute GLEU score for Chinese text.""" 8 | 9 | @staticmethod 10 | def match(response, correct_answer) -> Number: 11 | """Compute the BLEU scores between two strings.""" 12 | if isinstance(response, str) and isinstance(correct_answer, str): 13 | reference_tokens = list(jieba.cut_for_search(response)) 14 | translation_tokens = list(jieba.cut_for_search(correct_answer)) 15 | else: 16 | return 0 17 | return sentence_gleu([reference_tokens], translation_tokens) 18 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/jaccard.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_set 2 | from .common.metrics import jaccard_index 3 | 4 | 5 | class Jaccard: 6 | """Calculates the Jacard index for iterables.""" 7 | 8 | @classmethod 9 | def match(cls, responses, targets) -> float: 10 | """Exact match between targets and responses.""" 11 | if responses is None: 12 | return 0 13 | responses = cast_to_set(responses) 14 | targets = cast_to_set(targets) 15 | 16 | 
return jaccard_index(responses, targets) 17 | 18 | 19 | class JaccardCaseInsensitive: 20 | """Calculates the Jacard index for iterables of strings, 21 | Do not consider the case 22 | """ 23 | 24 | @classmethod 25 | def match(cls, responses, targets) -> float: 26 | """Exact match between targets and responses.""" 27 | if responses is None: 28 | return 0 29 | responses = cast_to_set(responses) 30 | targets = cast_to_set(targets) 31 | 32 | if isinstance(list(targets)[0], str): 33 | new_responses = { 34 | item.lower() if isinstance(item, str) else str(item).lower() 35 | for item in responses 36 | } 37 | new_targets = {item.lower() for item in targets} 38 | elif isinstance(list(targets)[0], tuple): 39 | new_responses = set() 40 | new_targets = set() 41 | try: 42 | for res in responses: 43 | new_res = tuple( 44 | [ 45 | item.lower() 46 | .replace(" ", "") 47 | .replace("-", "") 48 | .replace("\n", "") 49 | .replace("\t", "") 50 | .replace("_", "") 51 | .replace(".", "") 52 | for item in res 53 | ] 54 | ) 55 | new_responses.add(new_res) 56 | except: # the data type of the response might be wrong, return 0 in this case 57 | return 0 58 | for tgt in targets: 59 | new_tgt = tuple( 60 | [ 61 | item.lower() 62 | .replace(" ", "") 63 | .replace("-", "") 64 | .replace("\n", "") 65 | .replace("\t", "") 66 | .replace("_", "") 67 | .replace(".", "") 68 | for item in tgt 69 | ] 70 | ) 71 | new_targets.add(new_tgt) 72 | else: 73 | return 0 74 | 75 | return jaccard_index(new_responses, new_targets) 76 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/longest_common_list_prefix_ratio.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import str_to_list 2 | from .common.metrics import longest_common_prefix 3 | 4 | 5 | class LongestCommonListPrefixRatio: 6 | """Determines how much of the first part of the list 7 | was predicted correctly. 
8 | """ 9 | 10 | @classmethod 11 | def match(cls, responses, targets) -> int: 12 | """Exact match between targets and responses.""" 13 | responses = str_to_list(responses) 14 | targets = str_to_list(targets) 15 | return len(longest_common_prefix(responses, targets)) / len(targets) 16 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/mse.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import numpy as np 3 | import math 4 | from .common.metrics import mse 5 | from .common.conversions import str_to_list 6 | 7 | 8 | class MSE: 9 | """Mean Squared Error.""" 10 | 11 | @staticmethod 12 | def match(response: str, correct_answer: str) -> int: 13 | """Return the mean squared error.""" 14 | try: 15 | return mse(ast.literal_eval(response), ast.literal_eval(correct_answer)) 16 | except (SyntaxError, ValueError): 17 | return 0 18 | 19 | 20 | class NormalizedRMSE: 21 | """Mean Squared Error.""" 22 | 23 | MIN = 0.0 24 | MAX = 0.1 25 | 26 | @classmethod 27 | def match(cls, response: str, correct_answer: str) -> int: 28 | """Return the mean squared error.""" 29 | try: 30 | mse_val = mse(ast.literal_eval(response), ast.literal_eval(correct_answer)) 31 | rmse = np.clip(np.sqrt(mse_val), cls.MIN, cls.MAX) 32 | norm_rmse = 1 - (rmse - cls.MIN) / (cls.MAX - cls.MIN) 33 | return norm_rmse 34 | except (SyntaxError, ValueError): 35 | return 0 36 | 37 | 38 | class AngleSeqFloatRMSE: 39 | """Whether the sequence of numbers is close enough to the real answer.""" 40 | 41 | MIN = 0.0 42 | MAX = 10.0 43 | 44 | @classmethod 45 | def match(cls, responses, targets) -> float: 46 | """Determines whether the sequence of floats are close enough to the real answer.""" 47 | responses = str_to_list(responses) 48 | targets = str_to_list(targets) 49 | 50 | if len(responses) != len(targets): 51 | return 0 52 | 53 | try: 54 | res = np.array(responses) 55 | tgt = np.array(targets) 56 | rmse = np.sqrt(mse(res, tgt)).sum() / len(targets) 57 | except: # cannot obtain the rmse from the response, return 0 58 | return 0 59 | 60 | rmse = np.clip(rmse, cls.MIN, cls.MAX) 61 | norm_rmse = 1 - (rmse - cls.MIN) / (cls.MAX - cls.MIN) 62 | if math.isnan(norm_rmse): 63 | return 0 64 | return norm_rmse 65 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/multi_ref_phrase.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from .common.conversions import str_to_iterable 3 | from .simple_str_match import SimpleStrMatch 4 | 5 | 6 | def replace_potential_chinese_comma(input_string): 7 | return input_string.replace(",", ",") 8 | 9 | 10 | class MultipleReferencePhraseEval: 11 | """ 12 | Check the response with multiple correct references 13 | As long as one is matched, the score is 1, otherwise the score is 0 14 | """ 15 | 16 | @staticmethod 17 | def match(response, targets) -> Number: 18 | targets = replace_potential_chinese_comma(targets) 19 | refs = str_to_iterable(list, targets) 20 | matched = False 21 | for ref in refs: 22 | str_ref = ref if isinstance(ref, str) else str(ref) 23 | if SimpleStrMatch.match(response, str_ref): 24 | matched = True 25 | break 26 | return 1 if matched else 0 27 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/near_str_match.py: 
-------------------------------------------------------------------------------- 1 | import rapidfuzz 2 | import unidecode 3 | from .common.transformations import remove_def_indef_articles 4 | 5 | 6 | def approximate(text: str) -> str: 7 | """Return an approximation of the original string.""" 8 | return unidecode.unidecode(remove_def_indef_articles(text)).lower() 9 | 10 | 11 | class NearStrMatch: 12 | """Near string matching.""" 13 | 14 | @staticmethod 15 | def match(response, correct_answer: str, threshold=0.9) -> int: 16 | """Simple string match between response and correct_answer.""" 17 | if not isinstance(response, str) or not isinstance(correct_answer, str): 18 | return 0 19 | response = approximate(response) 20 | correct_answer = approximate(correct_answer) 21 | return rapidfuzz.distance.DamerauLevenshtein.normalized_similarity( 22 | response, correct_answer, score_cutoff=threshold 23 | ) 24 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/nli_entailment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import pipeline 3 | 4 | 5 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 6 | pipe = pipeline( 7 | "text-classification", model="microsoft/deberta-large-mnli", device=device 8 | ) 9 | 10 | 11 | class NliEntailment: 12 | """NLI entailment, where the correct answer is used as the premise.""" 13 | 14 | @staticmethod 15 | def match(response, correct_answer) -> int: 16 | """Return whether the response and correct answer agree with each other.""" 17 | if not isinstance(response, str) or not isinstance(correct_answer, str): 18 | return 0 19 | resp = pipe(f"[CLS] {correct_answer.strip()} [SEP] {response.strip()} [SEP]") 20 | return 1 if resp[0]["label"] == "ENTAILMENT" else 0 21 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/normalized_similarity_damerau_levenshtein.py: -------------------------------------------------------------------------------- 1 | import rapidfuzz 2 | 3 | 4 | class NormalizedSimilarityDamerauLevenshtein: 5 | """Normalized Damerau-Levenshtein Similarity.""" 6 | 7 | @staticmethod 8 | def match(response, correct_answer) -> int: 9 | """Normalized Damerau-Levenshtein similarity between targets and responses.""" 10 | if not isinstance(response, str) or not isinstance(correct_answer, str): 11 | return 0 12 | return rapidfuzz.distance.DamerauLevenshtein.normalized_similarity( 13 | response, correct_answer 14 | ) 15 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/number_rel_diff_ratio.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import math 3 | from numbers import Number 4 | 5 | 6 | class NumberRelDiffRatio: 7 | """Number relative difference ratio scoring = max(0, 1 - |pred - gt| / gt)""" 8 | 9 | @staticmethod 10 | def match(response: str | Number, correct_answer: str) -> int: 11 | """Return the relative difference ratio.""" 12 | try: 13 | if isinstance(response, Number): 14 | pred = response 15 | else: 16 | pred = ast.literal_eval(response) 17 | if not isinstance(pred, Number): 18 | return 0 19 | gt = ast.literal_eval(correct_answer) 20 | return max(0, 1 - math.fabs((pred - gt) / gt)) 21 | except (SyntaxError, ValueError): 22 | return 0 23 |
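# A quick sanity check of the relative-difference scoring above, assuming string-encoded
# numeric answers (the values are illustrative, not taken from any benchmark file):
#
#     NumberRelDiffRatio.match('95', '100')   # 1 - |95 - 100| / 100 = 0.95
#     NumberRelDiffRatio.match(250, '100')    # 1 - 150 / 100 is negative, so max(0, ...) clips it to 0
#     NumberRelDiffRatio.match('N/A', '100')  # ast.literal_eval fails, the except branch returns 0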
-------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/positive_int_match.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | 4 | class PositiveIntMatch: 5 | """Positive int matching.""" 6 | 7 | @staticmethod 8 | def match(response: str, correct_answer: str) -> int: 9 | """If the correct answer or response is a positive integer, then it returns if the predicted and correct answers are identical. 10 | 11 | Otherwise, it returns -1. 12 | """ 13 | try: 14 | response_obj = ast.literal_eval(response) 15 | except (SyntaxError, ValueError): 16 | return 0 17 | 18 | if not correct_answer: 19 | return 0 20 | 21 | correct_answer_obj = ast.literal_eval(correct_answer) 22 | 23 | assert isinstance(correct_answer_obj, int) 24 | if not isinstance(response_obj, int): 25 | return 0 26 | 27 | # We only want to score the fields with a positive amount 28 | if correct_answer_obj <= 0 and response_obj <= 0: 29 | return -1 30 | 31 | return 1 if response_obj == correct_answer_obj else 0 32 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/sacrebleu_bleu.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | import sacrebleu 3 | 4 | 5 | class Bleu: 6 | """Compute BLEU score, using SacreBLEU.""" 7 | 8 | @staticmethod 9 | def match(response, correct_answer) -> Number: 10 | """Compute the BLEU scores between two strings.""" 11 | if isinstance(response, str) and isinstance(correct_answer, str): 12 | resp = [response] 13 | corr = [correct_answer] 14 | elif isinstance(response, (list, tuple)) and isinstance( 15 | correct_answer, (list, tuple) 16 | ): 17 | resp = tuple(response) 18 | corr = tuple(correct_answer) 19 | else: 20 | return 0 21 | result = sacrebleu.corpus_bleu(corr, [resp]).score / 100 22 | return result 23 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/sequence_equality.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import str_to_list 2 | from numbers import Number 3 | 4 | 5 | class SequenceEquality: 6 | """Determines how much of the first part of the list 7 | was predicted correctly. 8 | """ 9 | 10 | @classmethod 11 | def match(cls, responses, targets) -> int: 12 | """Exact match between targets and responses.""" 13 | if not isinstance(responses, str): 14 | responses = str(responses) 15 | responses = str_to_list(responses) 16 | targets = str_to_list(targets) 17 | return 1 if responses == targets else 0 18 | 19 | 20 | class SequenceEqualityCaseInsensitive: 21 | """Determines how much of the first part of the list 22 | was predicted correctly. 23 | """ 24 | 25 | @classmethod 26 | def match(cls, responses, targets) -> int: 27 | """Exact match between targets and responses.""" 28 | if not isinstance(responses, str): 29 | responses = str(responses) 30 | responses = str_to_list(responses) 31 | targets = str_to_list(targets) 32 | 33 | responses = [ 34 | item.lower() if isinstance(item, str) else str(item) for item in responses 35 | ] 36 | targets = [item.lower() for item in targets] 37 | return 1 if responses == targets else 0 38 | 39 | 40 | class SequenceAccuracyCaseInsensitive: 41 | """Determines how much of the first part of the list 42 | was predicted correctly. 
43 | """ 44 | 45 | @classmethod 46 | def match(cls, responses, targets) -> int: 47 | """Exact match between targets and responses.""" 48 | responses = str_to_list(responses) 49 | targets = str_to_list(targets) 50 | if len(targets) != len(responses): 51 | return 0 52 | correct = 0 53 | for res, tgt in zip(responses, targets): 54 | if isinstance(tgt, str): 55 | if res.lower() == tgt.lower(): 56 | correct += 1 57 | elif isinstance(tgt, Number) and isinstance(res, Number): 58 | if res == tgt: 59 | correct += 1 60 | else: 61 | pass 62 | return correct / len(targets) 63 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/set_equality.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_set, str_to_set 2 | 3 | 4 | def _convert_to_hashable(item): 5 | """Convert an unhashable item (e.g. a list) into a hashable one.""" 6 | if isinstance(item, (list, tuple)): 7 | return tuple(item) # convert lists to tuples 8 | return item 9 | 10 | 11 | class SetEquality: 12 | """Determines whether two sets are equal.""" 13 | 14 | @classmethod 15 | def match(cls, responses, targets) -> int: 16 | """Exact match between targets and responses.""" 17 | if isinstance(responses, (list, tuple)): 18 | responses = {_convert_to_hashable(item) for item in responses} 19 | if isinstance(targets, (list, tuple)): 20 | targets = {_convert_to_hashable(item) for item in targets} 21 | return 1 if responses == targets else 0 22 | 23 | 24 | class SetEqualityCaseInsensitive: 25 | """Determines whether two sets are equal, ignoring string case.""" 26 | 27 | @classmethod 28 | def match(cls, responses, targets) -> int: 29 | """Exact match between targets and responses.""" 30 | try: 31 | responses: set[str] = {text.upper() for text in cast_to_set(responses)} 32 | targets: set[str] = {text.upper() for text in cast_to_set(targets)} 33 | except AttributeError: 34 | return 0 35 | return 1 if responses == targets else 0 36 | 37 | 38 | class StringSetEqualityLineSplit: 39 | """Determines whether two sets are equal, for string inputs, separated by line breaks""" 40 | 41 | @classmethod 42 | def match(cls, responses, targets) -> int: 43 | if "\\n" in targets: 44 | targets = targets.replace("\\n", "\n") 45 | if "\\n" in responses: 46 | responses = responses.replace("\\n", "\n") 47 | responses_set = set(responses.split("\n")) 48 | targets_set = set(targets.split("\n")) 49 | responses_set = { 50 | item.lower() if isinstance(item, str) else item for item in responses_set 51 | } 52 | targets_set = { 53 | item.lower() if isinstance(item, str) else item for item in targets_set 54 | } 55 | return 1 if responses_set == targets_set else 0 56 | 57 | 58 | class StringSetEqualityCommaSplit: 59 | """Determines whether two sets are equal, for string inputs, separated by commas 60 | Handles some corner cases that would fail the general SetEquality metric, like the string 61 | with "None", which fails the eval. Also does case-insensitive comparison.
62 | """ 63 | 64 | @classmethod 65 | def match(cls, responses, targets) -> int: 66 | responses_set = str_to_set(responses) 67 | targets_set = str_to_set(targets) 68 | responses_set = { 69 | item.lower() if isinstance(item, str) else item for item in responses_set 70 | } 71 | targets_set = { 72 | item.lower() if isinstance(item, str) else item for item in targets_set 73 | } 74 | return 1 if responses_set == targets_set else 0 75 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/set_precision.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_set 2 | from .common.metrics import set_precision 3 | 4 | 5 | class SetPrecision: 6 | """Calculates the set precision for iterables.""" 7 | 8 | @classmethod 9 | def match(cls, responses, targets) -> float: 10 | """Exact match between targets and responses.""" 11 | if responses is None: 12 | return 0 13 | responses = cast_to_set(responses) 14 | targets = cast_to_set(targets) 15 | 16 | return set_precision(responses, targets) 17 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/simple_str_match.py: -------------------------------------------------------------------------------- 1 | from .exact_str_match import ExactStrMatch 2 | 3 | 4 | class SimpleStrMatch: 5 | """Basic string matching, without spaces or hyphens.""" 6 | 7 | @staticmethod 8 | def match(response, correct_answer: str) -> int: 9 | """Simple string match between response and correct_answer.""" 10 | if not isinstance(response, str): 11 | response = str(response) # If it is JSON-like 12 | response = ( 13 | response.replace(" ", "") 14 | .replace("-", "") 15 | .replace("\n", "") 16 | .replace("\t", "") 17 | .replace(".", "") 18 | .lower() 19 | ) 20 | correct_answer = ( 21 | correct_answer.replace(" ", "") 22 | .replace("-", "") 23 | .replace("\n", "") 24 | .replace("\t", "") 25 | .replace(".", "") 26 | .lower() 27 | ) 28 | 29 | return ExactStrMatch.match(response, correct_answer) 30 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/unsupported_scoring.py: -------------------------------------------------------------------------------- 1 | class UnsupportedScoring: 2 | """Unsupported scoring.""" 3 | 4 | @staticmethod 5 | def match(response: str, correct_answer: str) -> int: 6 | """Default response for unimplemented metrics.""" 7 | return -1 8 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/xml_nbbox_iou.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .common.metrics import calculate_iou 3 | from .common.conversions import parse_bboxes_from_xml 4 | from numbers import Number 5 | 6 | 7 | class XmlNbboxIouSingle: 8 | """Calculates the IoU of bounding box. 
9 | 10 | Assumes that co-ordinates are normalized between 0 and 1 and that the bounding boxes 11 | are of the form top_left_x, top_left_y, bottom_right_x, bottom_right_y 12 | """ 13 | 14 | @classmethod 15 | def match(cls, responses, targets) -> float: 16 | 17 | logging.debug(f"{responses=}, {targets=}") 18 | if not isinstance(responses, (tuple | list)): 19 | responses = parse_bboxes_from_xml(responses) 20 | if not isinstance(targets, (tuple | list)): 21 | targets = parse_bboxes_from_xml(targets) 22 | 23 | if len(responses) == 0: 24 | return 0 25 | elif isinstance(responses[0], Number) and len(responses) == 4: 26 | responses = [responses] 27 | 28 | iou_scores = calculate_iou(responses, targets) 29 | if not iou_scores: 30 | return 0 31 | 32 | # Take the mean IoU score for now. 33 | return sum(iou_scores) / len(iou_scores) 34 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/xml_norm_point_distance.py: -------------------------------------------------------------------------------- 1 | """Return the normalized point distance.""" 2 | 3 | from .common.conversions import parse_point_2d_from_xml 4 | from .common.metrics import point_distance 5 | 6 | 7 | class XmlNormPointDistance: 8 | """Determines the distance between two points in XML notation. 9 | 10 | Assumes that co-ordinates are normalized between 0 and 1 and that the 2D point is 11 | of the form x, y. 12 | """ 13 | 14 | @classmethod 15 | def parse_2d_point(cls, point) -> tuple[float, float]: 16 | """Parse a 2D point encoded in XML as x, y.""" 17 | if not isinstance(point, (tuple | list)): 18 | point = parse_point_2d_from_xml(point) 19 | if not point: 20 | raise ValueError("Point could not be parsed from XML string.") 21 | elif len(point) != 2: 22 | raise ValueError("Point is not 2D.") 23 | if not all(0 <= comp <= 1 for comp in point): 24 | raise ValueError("Point is not normalized.") 25 | return tuple(point) 26 | 27 | @classmethod 28 | def match(cls, responses, targets) -> float: 29 | """Determine the normalized distance between two points.""" 30 | try: 31 | responses = cls.parse_2d_point(responses) 32 | targets = cls.parse_2d_point(targets) 33 | except ValueError: 34 | return 0 35 | 36 | # Instead of normalizing by 1/sqrt(2), we just set it to 0 if the distance is above 1. 37 | return max(0, 1 - point_distance(responses, targets)) 38 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/xml_norm_point_in_bbox.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import parse_point_2d_from_xml, str_to_bboxes 2 | 3 | 4 | class XmlNormPointInBbox: 5 | """Determines whether a point is located in a bounding box. 
6 | 7 | Assumes that co-ordinates are normalized between 0 and 1 and that the 2D point is 8 | of the form x, y 9 | """ 10 | 11 | @classmethod 12 | def match(cls, responses, eval_context) -> int: 13 | """Determine if the point is in the bounding box 14 | and return which bounding box was matched, if any.""" 15 | bounding_box_has_match = { 16 | bbox: False for bbox in eval_context["bounding_boxes"] 17 | } 18 | bounding_boxes = [ 19 | str_to_bboxes(bbox_str)[0] for bbox_str in eval_context["bounding_boxes"] 20 | ] 21 | assert bounding_boxes 22 | 23 | if not isinstance(responses, (tuple | list)): 24 | responses = parse_point_2d_from_xml(responses) 25 | if not responses: 26 | return 0, bounding_box_has_match 27 | elif len(responses) != 2: 28 | return 0, bounding_box_has_match 29 | 30 | x, y = responses 31 | for min_x, min_y, max_x, max_y in bounding_boxes: 32 | if min_x <= x <= max_x and min_y <= y <= max_y: 33 | bounding_box_has_match[str((min_x, min_y, max_x, max_y))] = True 34 | return 1, bounding_box_has_match 35 | return 0, bounding_box_has_match 36 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from mimetypes import guess_type 3 | 4 | 5 | def lazy_import(module_name, class_name): 6 | """Import the module lazily.""" 7 | 8 | def importer(): 9 | module = importlib.import_module(module_name) 10 | return getattr(module, class_name) 11 | 12 | return importer 13 | 14 | 15 | def is_video_file(file_path): 16 | mime_type, _ = guess_type(file_path) 17 | if not mime_type: 18 | return False 19 | return mime_type.startswith("video") 20 | 21 | 22 | 23 | def prepare_megabench_data(dataset_name, dataset_subset_name): 24 | """ 25 | Prepare the MEGA-Bench dataset for evaluation. 
26 | Return: 27 | subset_dataset: The organized data of the specified subset 28 | all_dataset: The organized data of all tasks, used for evaluation 29 | """ 30 | from datasets import load_dataset 31 | if "single_image" in dataset_subset_name: 32 | core_data = load_dataset(dataset_name, "core_single_image") 33 | open_data = load_dataset(dataset_name, "open_single_image") 34 | else: 35 | core_data = load_dataset(dataset_name, "core") 36 | open_data = load_dataset(dataset_name, "open") 37 | core_test_samples = list(core_data["test"]) 38 | organized_core_dataset = organize_hf_dataset(core_test_samples) 39 | open_test_samples = list(open_data["test"]) 40 | organized_open_dataset = organize_hf_dataset(open_test_samples) 41 | subset_dataset = organized_core_dataset if "core" in dataset_subset_name else organized_open_dataset 42 | all_dataset = organized_core_dataset + organized_open_dataset 43 | return subset_dataset, all_dataset 44 | 45 | 46 | def organize_hf_dataset(dataset): 47 | """ 48 | Organize the dataset in a task-based manner 49 | 50 | Return: 51 | organized_dataset: list, each item is a dict, with the following keys: 52 | - task_name: str 53 | - task_samples: list of dicts, each dict containing the sample information 54 | """ 55 | task_dict = {} 56 | for sample in dataset: 57 | task_name = sample["task_name"] 58 | if task_name not in task_dict: 59 | task_dict[task_name] = [] 60 | task_dict[task_name].append(sample) 61 | 62 | organized_dataset = [] 63 | for task_name, samples in task_dict.items(): 64 | organized_dataset.append({ 65 | "task_name": task_name, 66 | "task_samples": samples 67 | }) 68 | 69 | return organized_dataset 70 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/mmif/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/dataset/utils/mmif/__init__.py -------------------------------------------------------------------------------- /vlmeval/dataset/utils/mmsci4eval_req.txt: -------------------------------------------------------------------------------- 1 | evaluate 2 | pycocoevalcap 3 | bert_score 4 | rouge_score 5 | nltk 6 | absl-py 7 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/ocrbench.py: -------------------------------------------------------------------------------- 1 | from ...smp import * 2 | 3 | 4 | def OCRBench_eval(eval_file): 5 | OCRBench_score = { 6 | 'Regular Text Recognition': 0, 7 | 'Irregular Text Recognition': 0, 8 | 'Artistic Text Recognition': 0, 9 | 'Handwriting Recognition': 0, 10 | 'Digit String Recognition': 0, 11 | 'Non-Semantic Text Recognition': 0, 12 | 'Scene Text-centric VQA': 0, 13 | 'Doc-oriented VQA': 0, 14 | 'Key Information Extraction': 0, 15 | 'Handwritten Mathematical Expression Recognition': 0 16 | } 17 | 18 | logger = get_logger('Evaluation') 19 | 20 | data = load(eval_file) 21 | lt = len(data) 22 | lines = [data.iloc[i] for i in range(lt)] 23 | for i in tqdm(range(len(lines))): 24 | line = lines[i] 25 | predict = str(line['prediction']) 26 | answers = eval(line['answer']) 27 | category = line['category'] 28 | if category == 'Handwritten Mathematical Expression Recognition': 29 | for j in range(len(answers)): 30 | answer = answers[j].strip().replace('\n', ' ').replace(' ', '') 31 | predict = predict.strip().replace('\n', ' ').replace(' ', '') 32 | if answer in predict:
33 | OCRBench_score[category] += 1 34 | break 35 | else: 36 | for j in range(len(answers)): 37 | answer = answers[j].lower().strip().replace('\n', ' ') 38 | predict = predict.lower().strip().replace('\n', ' ') 39 | if answer in predict: 40 | OCRBench_score[category] += 1 41 | break 42 | 43 | final_score_dict = {} 44 | final_score_dict['Text Recognition'] = ( 45 | OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] 46 | + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition'] 47 | + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'] 48 | ) 49 | final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] 50 | final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] 51 | final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] 52 | final_score_dict['Handwritten Mathematical Expression Recognition'] = \ 53 | OCRBench_score['Handwritten Mathematical Expression Recognition'] 54 | final_score_dict['Final Score'] = ( 55 | final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] 56 | + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] 57 | + final_score_dict['Handwritten Mathematical Expression Recognition'] 58 | ) 59 | final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 60 | score_pth = eval_file.replace('.xlsx', '_score.json') 61 | dump(final_score_dict, score_pth) 62 | logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 63 | logger.info('Score: ') 64 | for key, value in final_score_dict.items(): 65 | logger.info('{}:{}'.format(key, value)) 66 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/qbench_video.py: -------------------------------------------------------------------------------- 1 | from ...smp import * 2 | from .multiple_choice import extract_answer_from_item 3 | from PIL import Image, ImageOps 4 | import numpy as np 5 | 6 | FAIL_MSG = 'Failed to obtain answer via API.' 7 | 8 | VQA_JUDGE_SYS_PROMPT = """ 9 | You are a helpful assistant that grades answers related to visual video quality. 10 | There are a lot of special terms or keywords related to video processing and photography. 11 | You will pay attention to the context of `quality evaluation' when grading. 12 | """ 13 | 14 | VQA_JUDGE_USER_PROMPT = """ 15 | Given the question {}, evaluate whether the response {} completely matches the correct answer {}. 16 | First, check the response and please rate score 0 if the response is not a valid answer. 17 | Please rate score 2 if the response completely or almost completely matches the correct answer on completeness, accuracy, and relevance. 18 | Please rate score 1 if the response partly matches the correct answer on completeness, accuracy, and relevance. 19 | Please rate score 0 if the response doesn't match the correct answer on completeness, accuracy, and relevance at all. 20 | Please only provide the result in the following format: Score:' 21 | """ # noqa: E501 22 | 23 | 24 | def check_ans_mcq(pred, ans, correct_choice, correct_answer): 25 | flag = False 26 | 27 | if correct_choice == pred or correct_choice + "." 
in pred or correct_answer == pred: 28 | flag = True 29 | elif correct_choice in pred.split("\n"): 30 | flag = True 31 | 32 | return flag 33 | 34 | 35 | def check_ans_vqa(model, line): 36 | score = model.generate(VQA_JUDGE_USER_PROMPT.format(line['question'], line['prediction'], line['answer'])).strip() 37 | return score 38 | 39 | 40 | def get_dimension_rating(score_file): 41 | score = load(score_file) 42 | result_dict = {} 43 | for idx, item in score.iterrows(): 44 | question_type = eval(item['dimensions'])[0].split(',')[0] 45 | if question_type not in result_dict: 46 | result_dict[question_type] = [0, 0] 47 | result_dict[question_type][0] += int(item['score']) 48 | result_dict[question_type][1] += 1 49 | return result_dict 50 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/dataset/utils/vgrpbench/__init__.py -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/coloredsudoku.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Union, Dict, Any 2 | import random 3 | import copy 4 | import os 5 | import json 6 | 7 | from .common_puzzle_factory import PuzzleFactory 8 | from .common_constriants import Constraint, ConstraintRowNoRepeat, ConstraintColNoRepeat, ConstraintSubGridNoRepeat 9 | 10 | class ConstraintColorNoRepeat(Constraint): 11 | def __init__(self) -> None: 12 | super().__init__() 13 | self.name = "constraint_color_no_repeat" 14 | 15 | def check(self, game_state: Dict[str, Any]) -> bool: 16 | board = game_state["board"] 17 | colors = game_state.get("colors", None) 18 | 19 | # If no colors are specified, skip this constraint 20 | if colors is None: 21 | return True 22 | 23 | color_groups = {} 24 | for i in range(len(board)): 25 | for j in range(len(board[0])): 26 | color = colors[i][j] 27 | if color not in color_groups: 28 | color_groups[color] = [] 29 | if board[i][j] != 0: 30 | color_groups[color].append(board[i][j]) 31 | for color_values in color_groups.values(): 32 | if len(set(color_values)) != len(color_values): 33 | return False 34 | return True 35 | 36 | class ColoredSudokuPuzzleFactory(PuzzleFactory): 37 | def __init__(self, size: int) -> None: 38 | super().__init__() 39 | self.game_name = "coloredsudoku" 40 | self.size = size 41 | self.constraints = [ 42 | ConstraintRowNoRepeat(), 43 | ConstraintColNoRepeat(), 44 | ConstraintColorNoRepeat() 45 | ] 46 | self.all_possible_values = [i for i in range(1, size + 1)] 47 | self.colors = [chr(65 + i) for i in range(size)] 48 | 49 | def get_possible_values(self, game_state: Dict[str, Any], row: int, col: int) -> List[int]: 50 | possible_values = [] 51 | board = game_state["board"] 52 | original_value = board[row][col] 53 | for value in self.all_possible_values: 54 | board[row][col] = value 55 | if self.check(game_state): 56 | possible_values.append(value) 57 | board[row][col] = original_value 58 | return possible_values 59 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/common_constriants.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | 3 | class Constraint(): 4 | def __init__(self) -> 
None: 5 | self.name = "" 6 | def check(self, game_state: Dict[str, Any]) -> bool: 7 | pass 8 | 9 | class ConstraintRowNoRepeat(Constraint): 10 | def __init__(self) -> None: 11 | super().__init__() 12 | self.name = "constraint_row_no_repeat" 13 | def check(self, game_state: Dict[str, Any]) -> bool: 14 | board = game_state["board"] 15 | for row in board: 16 | row_tmp = [cell for cell in row if cell != 0] 17 | if len(set(row_tmp)) != len(row_tmp): 18 | return False 19 | return True 20 | 21 | class ConstraintColNoRepeat(Constraint): 22 | def __init__(self) -> None: 23 | super().__init__() 24 | self.name = "constraint_col_no_repeat" 25 | def check(self, game_state: Dict[str, Any]) -> bool: 26 | board = game_state["board"] 27 | for col in range(len(board[0])): 28 | col_tmp = [board[row][col] for row in range(len(board)) if board[row][col] != 0] 29 | if len(set(col_tmp)) != len(col_tmp): 30 | return False 31 | return True 32 | 33 | class ConstraintSubGridNoRepeat(Constraint): 34 | def __init__(self) -> None: 35 | super().__init__() 36 | self.name = "constraint_sub_grid_no_repeat" 37 | def check(self, game_state: Dict[str, Any]) -> bool: 38 | board = game_state["board"] 39 | assert len(board) == len(board[0]), "board is not square" 40 | assert len(board) in [4, 9], "board size is not 4 or 9" 41 | 42 | sub_grid_size = int(len(board) ** 0.5) 43 | for i in range(0, len(board), sub_grid_size): 44 | for j in range(0, len(board[0]), sub_grid_size): 45 | sub_grid = [ 46 | board[x][y] for x in range(i, i + sub_grid_size) 47 | for y in range(j, j + sub_grid_size) 48 | if board[x][y] != 0 49 | ] 50 | if len(set(sub_grid)) != len(sub_grid): 51 | return False 52 | return True 53 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/common_get_game_factory.py: -------------------------------------------------------------------------------- 1 | def get_game_factory(game_type): 2 | if game_type == "sudoku": 3 | from .sudoku import SudokuPuzzleFactory as GameFactory 4 | elif game_type == "binairo": 5 | from .binairo import BinairoPuzzleFactory as GameFactory 6 | elif game_type == "coloredsudoku": 7 | from .coloredsudoku import ColoredSudokuPuzzleFactory as GameFactory 8 | elif game_type == "kakuro": 9 | from .kakuro import KakuroPuzzleFactory as GameFactory 10 | elif game_type == "killersudoku": 11 | from .killersudoku import KillerSudokuPuzzleFactory as GameFactory 12 | elif game_type == "renzoku": 13 | from .renzoku import RenzokuPuzzleFactory as GameFactory 14 | elif game_type == "skyscraper": 15 | from .skyscraper import SkyscraperPuzzleFactory as GameFactory 16 | elif game_type == "starbattle": 17 | from .starbattle import StarBattlePuzzleFactory as GameFactory 18 | elif game_type == "treesandtents": 19 | from .treesandtents import TreesAndTentsPuzzleFactory as GameFactory 20 | elif game_type == "thermometers": 21 | from .thermometers import ThermometersPuzzleFactory as GameFactory 22 | elif game_type == "futoshiki": 23 | from .futoshiki import FutoshikiPuzzleFactory as GameFactory 24 | elif game_type == "hitori": 25 | from .hitori import HitoriPuzzleFactory as GameFactory 26 | elif game_type == "aquarium": 27 | from .aquarium import AquariumPuzzleFactory as GameFactory 28 | elif game_type == "kakurasu": 29 | from .kakurasu import KakurasuPuzzleFactory as GameFactory 30 | elif game_type == "oddevensudoku": 31 | from .oddevensudoku import OddEvenSudokuPuzzleFactory as GameFactory 32 | elif game_type == "battleships": 33 | from .battleships 
import BattleshipsPuzzleFactory as GameFactory 34 | elif game_type == "fieldexplore": 35 | from .fieldexplore import FieldExplorePuzzleFactory as GameFactory 36 | elif game_type == "jigsawsudoku": 37 | from .jigsawsudoku import JigsawSudokuPuzzleFactory as GameFactory 38 | elif game_type == "lightup": 39 | from .lightup import LightUpPuzzleFactory as GameFactory 40 | elif game_type == "nonogram": 41 | from .nonogram import NonogramPuzzleFactory as GameFactory 42 | 43 | return GameFactory 44 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/common_get_prompt.py: -------------------------------------------------------------------------------- 1 | def get_prompt(game_type: str, thinking_format: str) -> str: 2 | if game_type == "sudoku": 3 | from puzzles.sudoku import SYSTEM_PROMPT 4 | elif game_type == "coloredsudoku": 5 | from puzzles.coloredsudoku import SYSTEM_PROMPT 6 | elif game_type == "binairo": 7 | from puzzles.binairo import SYSTEM_PROMPT 8 | elif game_type == "futoshiki": 9 | from puzzles.futoshiki import SYSTEM_PROMPT 10 | elif game_type == "hitori": 11 | from puzzles.hitori import SYSTEM_PROMPT 12 | elif game_type == "kakuro": 13 | from puzzles.kakuro import SYSTEM_PROMPT 14 | elif game_type == "killersudoku": 15 | from puzzles.killersudoku import SYSTEM_PROMPT 16 | elif game_type == "renzoku": 17 | from puzzles.renzoku import SYSTEM_PROMPT 18 | elif game_type == "skyscraper": 19 | from puzzles.skyscraper import SYSTEM_PROMPT 20 | elif game_type == "starbattle": 21 | from puzzles.starbattle import SYSTEM_PROMPT 22 | elif game_type == "sudoku": 23 | from puzzles.sudoku import SYSTEM_PROMPT 24 | elif game_type == "treesandtents": 25 | from puzzles.treesandtents import SYSTEM_PROMPT 26 | elif game_type == "thermometers": 27 | from puzzles.thermometers import SYSTEM_PROMPT 28 | elif game_type == "kakurasu": 29 | from puzzles.kakurasu import SYSTEM_PROMPT 30 | elif game_type == "aquarium": 31 | from puzzles.aquarium import SYSTEM_PROMPT 32 | elif game_type == "oddevensudoku": 33 | from puzzles.oddevensudoku import SYSTEM_PROMPT 34 | 35 | elif game_type == "battleships": 36 | from puzzles.battleships import SYSTEM_PROMPT 37 | elif game_type == "fieldexplore": 38 | from puzzles.fieldexplore import SYSTEM_PROMPT 39 | elif game_type == "jigsawsudoku": 40 | from puzzles.jigsawsudoku import SYSTEM_PROMPT 41 | elif game_type == "nonogram": 42 | from puzzles.nonogram import SYSTEM_PROMPT 43 | elif game_type == "lightup": 44 | from puzzles.lightup import SYSTEM_PROMPT 45 | 46 | else: 47 | raise ValueError(f"Unknown game type: {game_type}") 48 | 49 | if thinking_format == "direct_solution": 50 | return SYSTEM_PROMPT["direct_solution"] 51 | else: 52 | return SYSTEM_PROMPT["cot"] 53 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/fieldexplore.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Dict, Any, Tuple 3 | import random 4 | import copy 5 | import os 6 | import argparse 7 | 8 | from .common_puzzle_factory import PuzzleFactory 9 | from .common_constriants import Constraint 10 | 11 | class ConstraintAdjacentNumbers(Constraint): 12 | """Ensures revealed numbers match adjacent mine counts""" 13 | def check(self, game_state: List[List[Any]]) -> bool: 14 | 15 | board = game_state["board"] 16 | 17 | size = len(board) 18 | for i in range(size): 19 | for j in 
range(size): 20 | if isinstance(board[i][j], int) and board[i][j] != 0: # If cell is a revealed number 21 | # Count adjacent mines and undefined cells 22 | i_start = max(0, i-1) 23 | i_end = min(size, i+2) 24 | j_start = max(0, j-1) 25 | j_end = min(size, j+2) 26 | 27 | adjacent_mines = sum(1 for r in range(i_start, i_end) 28 | for c in range(j_start, j_end) 29 | if board[r][c] == 's') 30 | 31 | adjacent_undefined = sum(1 for r in range(i_start, i_end) 32 | for c in range(j_start, j_end) 33 | if board[r][c] == 0) 34 | 35 | # Check if current mines <= number <= potential mines (current + undefined) 36 | if adjacent_mines > board[i][j] or adjacent_mines + adjacent_undefined < board[i][j]: 37 | return False 38 | return True 39 | 40 | class FieldExplorePuzzleFactory(PuzzleFactory): 41 | def __init__(self, size: int) -> None: 42 | super().__init__() 43 | self.game_name = "fieldexplore" 44 | self.size = size 45 | self.constraints = [ConstraintAdjacentNumbers()] 46 | self.all_possible_values = ['s', 'e'] # True for 's', False for 'e' 47 | 48 | def check(self, board: List[List[Any]]) -> bool: 49 | for constraint in self.constraints: 50 | if not constraint.check(board): 51 | return False 52 | return True 53 | 54 | def get_possible_values(self, game_state: Dict[str, Any], row: int, col: int) -> List[int]: 55 | possible_values = [] 56 | board = game_state["board"] 57 | original_value = board[row][col] 58 | for value in self.all_possible_values: 59 | board[row][col] = value 60 | if self.check(game_state): 61 | possible_values.append(value) 62 | board[row][col] = original_value 63 | return possible_values 64 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/jigsawsudoku.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Union, Dict, Any 2 | import random 3 | import copy 4 | import os 5 | import json 6 | 7 | from .common_puzzle_factory import PuzzleFactory 8 | from .common_constriants import Constraint, ConstraintRowNoRepeat, ConstraintColNoRepeat, ConstraintSubGridNoRepeat 9 | 10 | class ConstraintRegionNoRepeat(Constraint): 11 | def __init__(self) -> None: 12 | super().__init__() 13 | self.name = "constraint_region_no_repeat" 14 | 15 | def check(self, game_state: Dict[str, Any]) -> bool: 16 | board = game_state["board"] 17 | regions = game_state.get("regions", None) 18 | 19 | if regions is None: 20 | return True 21 | 22 | region_groups = {} 23 | for i in range(len(board)): 24 | for j in range(len(board[0])): 25 | region = regions[i][j] 26 | if region not in region_groups: 27 | region_groups[region] = [] 28 | if board[i][j] != 0: 29 | region_groups[region].append(board[i][j]) 30 | for region_values in region_groups.values(): 31 | if len(set(region_values)) != len(region_values): 32 | return False 33 | return True 34 | 35 | class JigsawSudokuPuzzleFactory(PuzzleFactory): 36 | def __init__(self, size: int) -> None: 37 | super().__init__() 38 | self.game_name = "jigsawsudoku" 39 | self.size = size 40 | self.constraints = [ 41 | ConstraintRowNoRepeat(), 42 | ConstraintColNoRepeat(), 43 | ConstraintRegionNoRepeat() 44 | ] 45 | self.all_possible_values = [i for i in range(1, size + 1)] 46 | self.cached_region_splits = [] 47 | 48 | def get_possible_values(self, game_state: Dict[str, Any], row: int, col: int) -> List[int]: 49 | """Get possible values for a cell based on row, column, and region constraints.""" 50 | if game_state["board"][row][col] != 0: 51 | return [] 
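# A pre-filled cell has no candidate values; otherwise each value is tried in place and kept only if every constraint still holds.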
52 | possible_values = [] 53 | for value in self.all_possible_values: 54 | # Try the value 55 | original_value = game_state["board"][row][col] 56 | game_state["board"][row][col] = value 57 | # Check if it's valid according to all constraints 58 | valid = True 59 | for constraint in self.constraints: 60 | if not constraint.check(game_state): 61 | valid = False 62 | break 63 | 64 | # Restore original value 65 | game_state["board"][row][col] = original_value 66 | 67 | if valid: 68 | possible_values.append(value) 69 | 70 | return possible_values 71 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/killersudoku.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any, Tuple 2 | import random 3 | import copy 4 | import os 5 | import argparse 6 | 7 | from .common_puzzle_factory import PuzzleFactory 8 | from .common_constriants import Constraint, ConstraintRowNoRepeat, ConstraintColNoRepeat, ConstraintSubGridNoRepeat 9 | 10 | class ConstraintCageSum(Constraint): 11 | def __init__(self) -> None: 12 | super().__init__() 13 | self.name = "constraint_cage_sum" 14 | 15 | def check(self, game_state: Dict[str, Any]) -> bool: 16 | board = game_state["board"] 17 | cages = game_state.get("cages", []) # Default to empty list if no cages 18 | 19 | for cage in cages: 20 | cells = cage["cells"] 21 | target_sum = cage["sum"] 22 | current_sum = 0 23 | for row, col in cells: 24 | if board[row][col] == 0: # Skip empty cells 25 | continue 26 | current_sum += board[row][col] 27 | if current_sum > target_sum: # Can't exceed target sum 28 | return False 29 | # Only check equality if all cells in cage are filled 30 | if all(board[row][col] != 0 for row, col in cells) and current_sum != target_sum: 31 | return False 32 | return True 33 | 34 | class KillerSudokuPuzzleFactory(PuzzleFactory): 35 | def __init__(self, size: int) -> None: 36 | super().__init__() 37 | self.game_name = "killersudoku" 38 | self.size = size 39 | self.constraints = [ 40 | ConstraintRowNoRepeat(), 41 | ConstraintColNoRepeat(), 42 | ConstraintSubGridNoRepeat(), 43 | ConstraintCageSum() 44 | ] 45 | self.all_possible_values = [i for i in range(1, size + 1)] 46 | 47 | def get_possible_values(self, game_state: Dict[str, Any], row: int, col: int) -> List[int]: 48 | possible_values = [] 49 | board = game_state["board"] 50 | original_value = board[row][col] 51 | 52 | # Ensure cages exist in game_state 53 | if "cages" not in game_state: 54 | game_state["cages"] = [] 55 | 56 | for value in self.all_possible_values: 57 | board[row][col] = value 58 | if self.check(game_state): 59 | possible_values.append(value) 60 | board[row][col] = original_value 61 | return possible_values 62 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/sudoku.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Union, Dict, Any 2 | import random 3 | import copy 4 | from abc import ABC, abstractmethod 5 | import os 6 | import json 7 | import argparse 8 | 9 | from .common_puzzle_factory import PuzzleFactory 10 | from .common_constriants import Constraint, ConstraintRowNoRepeat, ConstraintColNoRepeat, ConstraintSubGridNoRepeat 11 | 12 | class SudokuPuzzleFactory(PuzzleFactory): 13 | def __init__(self, size: int) -> None: 14 | super().__init__() 15 | self.game_name = "sudoku" 16 | self.size = size 17 | 18 | 
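# Classic Sudoku rules: no value may repeat within a row, a column, or a sub-grid (the sub-grid constraint assumes a 4x4 or 9x9 board).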
self.constraints.append(ConstraintRowNoRepeat()) 19 | self.constraints.append(ConstraintColNoRepeat()) 20 | self.constraints.append(ConstraintSubGridNoRepeat()) 21 | 22 | self.all_possible_values = [i for i in range(1, size + 1)] 23 | def get_possible_values(self, game_state: Dict[str, Any], row: int, col: int) -> List[int]: 24 | possible_values = [] 25 | board = game_state["board"] 26 | original_value = board[row][col] 27 | for value in self.all_possible_values: 28 | board[row][col] = value 29 | if self.check(game_state): 30 | possible_values.append(value) 31 | board[row][col] = original_value 32 | return possible_values 33 | -------------------------------------------------------------------------------- /vlmeval/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .vlm import * 3 | from .misc import * 4 | from .log import * 5 | -------------------------------------------------------------------------------- /vlmeval/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.basicConfig( 3 | format='[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s', 4 | datefmt='%Y-%m-%d %H:%M:%S') 5 | 6 | logger_initialized = {} 7 | 8 | 9 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 10 | logger = logging.getLogger(name) 11 | if name in logger_initialized: 12 | return logger 13 | 14 | for logger_name in logger_initialized: 15 | if name.startswith(logger_name): 16 | return logger 17 | 18 | stream_handler = logging.StreamHandler() 19 | handlers = [stream_handler] 20 | 21 | try: 22 | import torch.distributed as dist 23 | if dist.is_available() and dist.is_initialized(): 24 | rank = dist.get_rank() 25 | else: 26 | rank = 0 27 | except ImportError: 28 | rank = 0 29 | 30 | if rank == 0 and log_file is not None: 31 | file_handler = logging.FileHandler(log_file, file_mode) 32 | handlers.append(file_handler) 33 | 34 | formatter = logging.Formatter( 35 | '[%(asctime)s] %(levelname)s - %(name)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s', 36 | datefmt='%Y-%m-%d %H:%M:%S') 37 | for handler in handlers: 38 | handler.setFormatter(formatter) 39 | handler.setLevel(log_level) 40 | logger.propagate = False 41 | logger.addHandler(handler) 42 | 43 | if rank == 0: 44 | logger.setLevel(log_level) 45 | else: 46 | logger.setLevel(logging.ERROR) 47 | 48 | logger_initialized[name] = True 49 | return logger 50 | -------------------------------------------------------------------------------- /vlmeval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .matching_util import can_infer, can_infer_option, can_infer_text, can_infer_sequence, can_infer_lego 2 | from .mp_util import track_progress_rich 3 | 4 | 5 | __all__ = [ 6 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 'can_infer_sequence', 'can_infer_lego', 7 | ] 8 | -------------------------------------------------------------------------------- /vlmeval/utils/mp_util.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import os 3 | from typing import Callable, Iterable, Sized 4 | 5 | from rich.progress import (BarColumn, MofNCompleteColumn, Progress, Task, 6 | TaskProgressColumn, TextColumn, TimeRemainingColumn) 7 | from rich.text import Text 8 | import os.path as osp 9 | import time 10 | import 
portalocker 11 | from ..smp import load, dump 12 | 13 | 14 | def track_progress_rich( 15 | func: Callable, 16 | tasks: Iterable = tuple(), 17 | nproc: int = 1, 18 | save=None, 19 | keys=None, 20 | **kwargs) -> list: 21 | 22 | from concurrent.futures import ThreadPoolExecutor 23 | from tqdm import tqdm 24 | if save is not None: 25 | assert osp.exists(osp.dirname(save)) or osp.dirname(save) == '' 26 | if not osp.exists(save): 27 | dump({}, save) 28 | if keys is not None: 29 | assert len(keys) == len(tasks) 30 | if not callable(func): 31 | raise TypeError('func must be a callable object') 32 | if not isinstance(tasks, Iterable): 33 | raise TypeError( 34 | f'tasks must be an iterable object, but got {type(tasks)}') 35 | assert nproc > 0, 'nproc must be a positive number' 36 | res = load(save) if save is not None else {} 37 | results = [None for _ in range(len(tasks))] 38 | 39 | with ThreadPoolExecutor(max_workers=nproc) as executor: 40 | futures = [] 41 | 42 | for inputs in tasks: 43 | if not isinstance(inputs, (tuple, list, dict)): 44 | inputs = (inputs, ) 45 | if isinstance(inputs, dict): 46 | future = executor.submit(func, **inputs) 47 | else: 48 | future = executor.submit(func, *inputs) 49 | futures.append(future) 50 | 51 | unfinished = set(range(len(tasks))) 52 | pbar = tqdm(total=len(unfinished)) 53 | while len(unfinished): 54 | new_finished = set() 55 | for idx in unfinished: 56 | if futures[idx].done(): 57 | results[idx] = futures[idx].result() 58 | new_finished.add(idx) 59 | if keys is not None: 60 | res[keys[idx]] = results[idx] 61 | if len(new_finished): 62 | if save is not None: 63 | dump(res, save) 64 | pbar.update(len(new_finished)) 65 | for k in new_finished: 66 | unfinished.remove(k) 67 | time.sleep(0.1) 68 | pbar.close() 69 | 70 | if save is not None: 71 | dump(res, save) 72 | return results 73 | -------------------------------------------------------------------------------- /vlmeval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.set_grad_enabled(False) 4 | torch.manual_seed(1234) 5 | from .aria import Aria 6 | from .base import BaseModel 7 | from .cogvlm import CogVlm, GLM4v 8 | from .emu import Emu, Emu3_chat, Emu3_gen 9 | from .eagle_x import Eagle 10 | from .idefics import IDEFICS, IDEFICS2 11 | from .instructblip import InstructBLIP 12 | from .kosmos import Kosmos2 13 | from .llava import ( 14 | LLaVA, 15 | LLaVA_Next, 16 | LLaVA_XTuner, 17 | LLaVA_Next2, 18 | LLaVA_OneVision, 19 | LLaVA_OneVision_HF, 20 | ) 21 | from .vita import VITA, VITAQwen2 22 | from .long_vita import LongVITA 23 | from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6, MiniCPM_o_2_6 24 | from .minigpt4 import MiniGPT4 25 | from .mmalaya import MMAlaya, MMAlaya2 26 | from .monkey import Monkey, MonkeyChat 27 | from .moondream import Moondream1, Moondream2 28 | from .minimonkey import MiniMonkey 29 | from .mplug_owl2 import mPLUG_Owl2 30 | from .omnilmm import OmniLMM12B 31 | from .open_flamingo import OpenFlamingo 32 | from .pandagpt import PandaGPT 33 | from .qwen_vl import QwenVL, QwenVLChat 34 | from .qwen2_vl import Qwen2VLChat, Qwen2VLChatAguvis 35 | from .transcore_m import TransCoreM 36 | from .visualglm import VisualGLM 37 | from .xcomposer import ( 38 | ShareCaptioner, 39 | XComposer, 40 | XComposer2, 41 | XComposer2_4KHD, 42 | XComposer2d5, 43 | ) 44 | from .yi_vl import Yi_VL 45 | from .internvl import InternVLChat 46 | from .deepseek_vl import DeepSeekVL 47 | from .deepseek_vl2 import 
DeepSeekVL2 48 | from .janus import Janus 49 | from .mgm import Mini_Gemini 50 | from .bunnyllama3 import BunnyLLama3 51 | from .vxverse import VXVERSE 52 | from .gemma import PaliGemma, Gemma3 53 | from .qh_360vl import QH_360VL 54 | from .phi3_vision import Phi3Vision, Phi3_5Vision 55 | from .phi4_multimodal import Phi4Multimodal 56 | from .wemm import WeMM 57 | from .cambrian import Cambrian 58 | from .chameleon import Chameleon 59 | from .video_llm import ( 60 | VideoLLaVA, 61 | VideoLLaVA_HF, 62 | Chatunivi, 63 | VideoChatGPT, 64 | LLaMAVID, 65 | VideoChat2_HD, 66 | PLLaVA, 67 | ) 68 | from .vila import VILA, NVILA 69 | from .ovis import Ovis, Ovis1_6, Ovis1_6_Plus, Ovis2 70 | from .mantis import Mantis 71 | from .mixsense import LLama3Mixsense 72 | from .parrot import Parrot 73 | from .omchat import OmChat 74 | from .rbdash import RBDash 75 | from .xgen_mm import XGenMM 76 | from .slime import SliME 77 | from .mplug_owl3 import mPLUG_Owl3 78 | from .pixtral import Pixtral 79 | from .llama_vision import llama_vision 80 | from .llama4 import llama4 81 | from .molmo import molmo 82 | from .points import POINTS, POINTSV15 83 | from .nvlm import NVLM 84 | from .vintern_chat import VinternChat 85 | from .h2ovl_mississippi import H2OVLChat 86 | from .falcon_vlm import Falcon2VLM 87 | from .smolvlm import SmolVLM, SmolVLM2 88 | from .sail_vl import SailVL 89 | from .valley import Valley2Chat 90 | from .ross import Ross 91 | from .ola import Ola 92 | from .ursa import UrsaChat 93 | from .vlm_r1 import VLMR1Chat 94 | from .aki import AKI 95 | from .ristretto import Ristretto 96 | from .vlaa_thinker import VLAAThinkerChat 97 | from .kimi_vl import KimiVL 98 | from .wethink_vl import WeThinkVL 99 | from .flash_vl import FlashVL 100 | -------------------------------------------------------------------------------- /vlmeval/vlm/chameleon.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import warnings 3 | from .base import BaseModel 4 | from ..smp import * 5 | from PIL import Image 6 | import torch 7 | 8 | 9 | class Chameleon(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = True 13 | 14 | def __init__(self, model_path='facebook/chameleon-7b', **kwargs): 15 | try: 16 | from transformers import ChameleonProcessor, ChameleonForConditionalGeneration 17 | except Exception as e: 18 | logging.critical('Please install the latest transformers.') 19 | raise e 20 | 21 | processor = ChameleonProcessor.from_pretrained(model_path) 22 | model = ChameleonForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16) 23 | 24 | self.model = model.cuda().eval() 25 | self.processor = processor 26 | 27 | def generate_inner(self, message, dataset=None): 28 | content, images = '', [] 29 | for x in message: 30 | if x['type'] == 'text': 31 | content += x['value'] 32 | elif x['type'] == 'image': 33 | content += '\n' 34 | images.append(Image.open(x['value'])) 35 | 36 | inputs = self.processor( 37 | text=[content], 38 | images=images, 39 | padding=True, 40 | return_tensors='pt' 41 | ).to(device='cuda', dtype=torch.bfloat16) 42 | generate_ids = self.model.generate(**inputs, max_new_tokens=2048) 43 | input_token_len = inputs.input_ids.shape[1] 44 | text = self.processor.batch_decode( 45 | generate_ids[:, input_token_len:], 46 | skip_special_tokens=True, 47 | clean_up_tokenization_spaces=False 48 | )[0] 49 | return text 50 | -------------------------------------------------------------------------------- /vlmeval/vlm/falcon_vlm.py: 
-------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import requests 3 | 4 | from .base import BaseModel 5 | 6 | 7 | class Falcon2VLM(BaseModel): 8 | 9 | INSTALL_REQ = False 10 | INTERLEAVE = False 11 | 12 | def __init__(self, model_path='tiiuae/falcon-11B-vlm', **kwargs): 13 | import torch 14 | from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor 15 | 16 | self.model_path = model_path 17 | self.processor = LlavaNextProcessor.from_pretrained(model_path, tokenizer_class='PreTrainedTokenizerFast') 18 | self.model = LlavaNextForConditionalGeneration.from_pretrained( 19 | model_path, torch_dtype=torch.bfloat16, device_map='cuda').eval() 20 | default_kwargs = {'max_new_tokens': 512} 21 | default_kwargs.update(kwargs) 22 | self.kwargs = default_kwargs 23 | 24 | def generate_inner(self, message, dataset=None): 25 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 26 | image = Image.open(image_path).convert('RGB') 27 | 28 | prompt = f'User:\n{prompt} Falcon:' 29 | inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda') 30 | 31 | output = self.model.generate(**inputs, **self.kwargs) 32 | prompt_length = inputs['input_ids'].shape[1] 33 | model_response = self.processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip() 34 | return model_response 35 | -------------------------------------------------------------------------------- /vlmeval/vlm/instructblip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import os.path as osp 4 | import sys 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class InstructBLIP(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, name): 15 | self.config_map = { 16 | 'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml', 17 | 'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml', 18 | } 19 | 20 | self.file_path = __file__ 21 | config_root = osp.dirname(self.file_path) 22 | 23 | try: 24 | from lavis.models import load_preprocess 25 | from omegaconf import OmegaConf 26 | from lavis.common.registry import registry 27 | except Exception as e: 28 | logging.critical('Please install lavis before using InstructBLIP. 
') 29 | raise e 30 | 31 | assert name in self.config_map 32 | cfg_path = osp.join(config_root, self.config_map[name]) 33 | cfg = OmegaConf.load(cfg_path) 34 | 35 | model_cfg = cfg.model 36 | assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2 37 | model_cls = registry.get_model_class(name='blip2_vicuna_instruct') 38 | model = model_cls.from_config(model_cfg) 39 | model.eval() 40 | 41 | self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu' 42 | device = self.device 43 | model.to(device) 44 | self.model = model 45 | self.kwargs = {'max_length': 512} 46 | 47 | preprocess_cfg = cfg.preprocess 48 | vis_processors, _ = load_preprocess(preprocess_cfg) 49 | self.vis_processors = vis_processors 50 | 51 | def generate_inner(self, message, dataset=None): 52 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 53 | vis_processors = self.vis_processors 54 | raw_image = Image.open(image_path).convert('RGB') 55 | image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device) 56 | outputs = self.model.generate(dict(image=image_tensor, prompt=prompt)) 57 | return outputs[0] 58 | -------------------------------------------------------------------------------- /vlmeval/vlm/internvl/__init__.py: -------------------------------------------------------------------------------- 1 | from .internvl_chat import InternVLChat 2 | 3 | __all__ = ['InternVLChat'] 4 | -------------------------------------------------------------------------------- /vlmeval/vlm/internvl/gui_template.yaml: -------------------------------------------------------------------------------- 1 | ScreenSpot: 2 | template_zeroshot: |- 3 | Based on the screenshot of the page, I give a text description and you give the bounding box coordinate of the region this sentence describes: {task} 4 | template: |- 5 | {task} 6 | placeholders: 7 | - task 8 | 9 | ScreenSpot_Pro: 10 | template_zeroshot: |- 11 | Based on the screenshot of the page, I give a text description and you give the bounding box coordinate of the region this sentence describes: {task} 12 | template: |- 13 | {task} 14 | placeholders: 15 | - task 16 | 17 | ScreenSpot_v2: 18 | template_zeroshot: |- 19 | Based on the screenshot of the page, I give a text description and you give the bounding box coordinate of the region this sentence describes: {task} 20 | template: |- 21 | {task} 22 | placeholders: 23 | - task 24 | 25 | MM_Mind2Web: 26 | system_prompt: |- 27 | You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. 28 | 29 | You have access to the following functions: 30 | - {"name": "mobile.swipe", "description": "swipe on the screen", "parameters": {"type": "object", "properties": {"from_coord": {"type": "array", "items": {"type": "number"}, "description": "The starting coordinates of the swipe"}, "to_coord": {"type": "array", "items": {"type": "number"}, "description": "The ending coordinates of the swipe"}}, "required": ["from_coord", "to_coord"]}} 31 | - {"name": "mobile.home", "description": "Press the home button"} 32 | - {"name": "mobile.back", "description": "Press the back button"} 33 | 34 | template: |- 35 | Please generate the next move according to the ui screenshot, instruction and previous actions. 36 | 37 | Instruction: 38 | {task}. 39 | 40 | Previous actions: 41 | {history}. 
42 | 43 | placeholders: 44 | - task 45 | - history 46 | -------------------------------------------------------------------------------- /vlmeval/vlm/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF 2 | from .llava_xtuner import LLaVA_XTuner 3 | 4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF'] 5 | -------------------------------------------------------------------------------- /vlmeval/vlm/minigpt4.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os.path as osp 4 | import warnings 5 | from transformers import StoppingCriteriaList 6 | from .base import BaseModel 7 | 8 | 9 | class MiniGPT4(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, 15 | mode='v2', 16 | root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/', 17 | temperature=1, 18 | max_out_len=512): 19 | 20 | if root is None: 21 | warnings.warn( 22 | 'Please set root to the directory of MiniGPT-4, which is cloned from here: ' 23 | 'https://github.com/Vision-CAIR/MiniGPT-4. ' 24 | ) 25 | 26 | if mode == 'v2': 27 | cfg = 'minigptv2_eval.yaml' 28 | elif mode == 'v1_7b': 29 | cfg = 'minigpt4_7b_eval.yaml' 30 | elif mode == 'v1_13b': 31 | cfg = 'minigpt4_13b_eval.yaml' 32 | else: 33 | raise NotImplementedError 34 | 35 | self.mode = mode 36 | self.temperature = temperature 37 | self.max_out_len = max_out_len 38 | self.root = root 39 | this_dir = osp.dirname(__file__) 40 | 41 | self.cfg = osp.join(this_dir, 'misc', cfg) 42 | sys.path.append(self.root) 43 | 44 | from omegaconf import OmegaConf 45 | from minigpt4.common.registry import registry 46 | from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2 47 | 48 | device = torch.cuda.current_device() 49 | self.device = device 50 | 51 | cfg_path = self.cfg 52 | cfg = OmegaConf.load(cfg_path) 53 | 54 | model_cfg = cfg.model 55 | model_cfg.device_8bit = device 56 | model_cls = registry.get_model_class(model_cfg.arch) 57 | model = model_cls.from_config(model_cfg) 58 | model = model.to(device) 59 | model.eval() 60 | vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train 61 | vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) 62 | self.model = model 63 | self.vis_processor = vis_processor 64 | 65 | self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0 66 | stop_words_ids = [[835], [2277, 29937]] 67 | stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids] 68 | self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) 69 | 70 | def generate_inner(self, message, dataset=None): 71 | from minigpt4.conversation.conversation import Chat 72 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 73 | if self.mode == 'v2': 74 | chat = Chat(self.model, self.vis_processor, device=self.device) 75 | else: 76 | chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria) 77 | 78 | chat_state = self.CONV_VISION.copy() 79 | img_list = [] 80 | _ = chat.upload_img(image_path, chat_state, img_list) 81 | chat.encode_img(img_list) 82 | chat.ask(prompt, chat_state) 83 | with torch.inference_mode(): 84 | msg = chat.answer(conv=chat_state, 
img_list=img_list)[0] 85 | return msg 86 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigpt4_13b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-13b-v0" 25 | 26 | datasets: 27 | cc_sbu_align: 28 | vis_processor: 29 | train: 30 | 
name: "blip2_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | 36 | run: 37 | task: image_text_pretrain 38 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigpt4_7b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-7b-v0" 25 | 26 | 27 | datasets: 28 | cc_sbu_align: 29 | vis_processor: 30 | train: 31 | name: "blip2_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | 37 | run: 38 | task: image_text_pretrain 39 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigptv2_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt_v2 3 | model_type: pretrain 4 | max_txt_len: 160 5 | end_sym: "" 6 | low_resource: True 7 | prompt_template: '[INST] {} [/INST]' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | lora_r: 64 10 | lora_alpha: 16 11 | 12 | # vit encoder 13 | image_size: 448 14 | drop_path_rate: 0 15 | use_grad_checkpoint: False 16 | vit_precision: "fp16" 17 | freeze_vit: True 18 | 19 | # generation configs 20 | prompt: "" 21 | 22 | # LLM 23 | llama_model: "please set this value to the path of llama2-chat-7b" 24 | 25 | datasets: 26 | cc_sbu_align: 27 | vis_processor: 28 | train: 29 | name: "blip2_image_eval" 30 | image_size: 448 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | 35 | run: 36 | task: image_text_pretrain 37 | -------------------------------------------------------------------------------- /vlmeval/vlm/mixsense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from PIL import Image 5 | import warnings 6 | 7 | from .base import BaseModel 8 | from ..smp import * 9 | 10 | 11 | class LLama3Mixsense(BaseModel): 12 | 13 | INSTALL_REQ = False 14 | INTERLEAVE = False 15 | 16 | def __init__(self, model_path='Zero-Vision/Llama-3-MixSenseV1_1', **kwargs): 17 | assert model_path is not None 18 | transformers.logging.set_verbosity_error() 19 | transformers.logging.disable_progress_bar() 20 | warnings.filterwarnings('ignore') 21 | self.tokenizer = AutoTokenizer.from_pretrained( 22 | model_path, trust_remote_code=True 23 | ) 24 | self.model = AutoModelForCausalLM.from_pretrained( 25 | model_path, trust_remote_code=True, device_map='auto' 26 | ).eval() 27 | self.kwargs = kwargs 28 | 29 | def generate_inner(self, message, dataset=None): 30 | prompt, image_path = self.message_to_promptimg(message) 31 | input_ids = self.model.text_process(prompt, self.tokenizer).to(device='cuda') 32 | image = Image.open(image_path).convert('RGB') 33 | image_tensor = self.model.image_process([image]).to(dtype=self.model.dtype, device='cuda') 34 | # generate 35 | 
with torch.inference_mode(): 36 | output_ids = self.model.generate( 37 | input_ids, 38 | images=image_tensor, 39 | max_new_tokens=2048, 40 | use_cache=True, 41 | eos_token_id=[ 42 | self.tokenizer.eos_token_id, 43 | self.tokenizer.convert_tokens_to_ids(['<|eot_id|>'])[0], 44 | ], 45 | ) 46 | return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 47 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/__init__.py: -------------------------------------------------------------------------------- 1 | from .ola_model import Ola 2 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/arguments.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | from dataclasses import dataclass, field 4 | from typing import Optional 5 | 6 | 7 | @dataclass 8 | class ModelArguments: 9 | model_name_or_path: Optional[str] = field(default="facebook/opt-125m") 10 | version: Optional[str] = field(default="v0") 11 | freeze_backbone: bool = field(default=False) 12 | tune_speech_projector: bool = field(default=False) 13 | tune_speech_encoder: bool = field(default=False) 14 | tune_speech_generator_only: bool = field(default=False) 15 | speech_encoder_type: Optional[str] = field(default=None) 16 | speech_encoder: Optional[str] = field(default=None) 17 | pretrain_speech_projector: Optional[str] = field(default=None) 18 | speech_projector_type: Optional[str] = field(default='linear') 19 | speech_encoder_ds_rate: int = 5 20 | speech_encoder_hidden_size: int = 1280 21 | 22 | 23 | @dataclass 24 | class DataArguments: 25 | data_path: str = field(default=None, 26 | metadata={"help": "Path to the training data."}) 27 | is_multimodal: bool = False 28 | input_type: str = field(default="mel") 29 | speech_normalize: bool = False 30 | mel_size: int = 128 31 | has_tgt_units: bool = False 32 | 33 | 34 | @dataclass 35 | class TrainingArguments(transformers.TrainingArguments): 36 | cache_dir: Optional[str] = field(default=None) 37 | optim: str = field(default="adamw_torch") 38 | freeze_speech_projector: bool = field(default=False) 39 | model_max_length: int = field( 40 | default=512, 41 | metadata={ 42 | "help": 43 | "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 44 | }, 45 | ) 46 | double_quant: bool = field( 47 | default=True, 48 | metadata={"help": "Compress the quantization statistics through double quantization."} 49 | ) 50 | quant_type: str = field( 51 | default="nf4", 52 | metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."} 53 | ) 54 | bits: int = field( 55 | default=16, 56 | metadata={"help": "How many bits to use."} 57 | ) 58 | lora_enable: bool = False 59 | lora_r: int = 64 60 | lora_alpha: int = 16 61 | lora_dropout: float = 0.05 62 | lora_weight_path: str = "" 63 | lora_bias: str = "none" 64 | speech_projector_lr: Optional[float] = None 65 | group_by_modality_length: bool = field(default=False) 66 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
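# Directory where log files are written; '.' keeps them in the current working directory.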
5 | 6 | # Model Constants (placeholder token strings below assume the standard LLaVA-style convention) 7 | IGNORE_INDEX = -100 8 | SPEECH_TOKEN_INDEX = -200 9 | DEFAULT_SPEECH_TOKEN = "<speech>" 10 | IMAGE_TOKEN_INDEX = -300 11 | DEFAULT_IMAGE_TOKEN = "<image>" 12 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 13 | DEFAULT_IM_START_TOKEN = "<im_start>" 14 | DEFAULT_IM_END_TOKEN = "<im_end>" 15 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/vlm/ola/ola/datasets/__init__.py -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.ola_qwen import OlaQwenForCausalLM, OlaConfigQwen 2 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .oryx_vit import SigLIPViTAnysizeWrapper 3 | 4 | def build_vision_tower(vision_tower_cfg, **kwargs): 5 | vision_tower = getattr(vision_tower_cfg, 'vision_tower', getattr(vision_tower_cfg, 'mm_vision_tower', None)) 6 | is_absolute_path_exists = os.path.exists(vision_tower) 7 | print(f"Building OryxViTWrapper from {vision_tower}...") 8 | # path = vision_tower.split(":")[1] 9 | return SigLIPViTAnysizeWrapper(vision_tower, path=vision_tower, args=vision_tower_cfg, **kwargs) 10 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/multimodal_projector/pooler_projector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | 6 | from transformers.models.clip.modeling_clip import CLIPVisionModel 7 | import os 8 | 9 | 10 | class PoolerProjector(nn.Module): 11 | def __init__(self, config, vision_cfg): 12 | super().__init__() 13 | self._config = config 14 | self.hw = vision_cfg.image_size // vision_cfg.patch_size 15 | 16 | self.conv_pool = nn.Conv2d( 17 | config.mm_hidden_size, config.hidden_size, 18 | kernel_size=2, stride=2 19 | ) 20 | 21 | self.proj = nn.Sequential( 22 | nn.GELU(), 23 | nn.Linear(config.hidden_size, config.hidden_size), 24 | ) 25 | 26 | def forward(self, x, *args, **kwargs): 27 | height = width = self.hw 28 | assert height * width == x.shape[1] 29 | x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2) 30 | x = self.conv_pool(x) 31 | x = x.flatten(2).transpose(1, 2) 32 | x = self.proj(x) 33 | return x 34 | 35 | @property 36 | def config(self): 37 | return {"mm_projector_type": 'pooler'} 38 | 39 | 40 | class NormalizedDwPooler(nn.Module): 41 | def __init__(self, dim): 42 | super().__init__() 43 | self.dim = dim 44 | self.predictor = nn.Sequential( 45 | nn.Linear(dim*2, dim), 46 | nn.GELU(), 47 | nn.Linear(dim, dim), 48 | ) 49 | 50 | def forward(self, x, forward_type='2x'): 51 | B, H, W, C = x.shape 52 | 53 | if forward_type == '2x': 54 | new_x = x.reshape(B, H//2, 2, W//2, 2, C).permute(0, 1, 3, 2, 4, 5).reshape(B, H//2, W//2, 4, C) 55 | pooled_x = new_x.mean(-2, keepdim=True).expand(-1, -1, -1, 4, -1) 56 | fused_x = torch.cat([new_x, pooled_x], dim=-1) 57 | elif forward_type == '1x': 58 | new_x = x.reshape(B, H, W, 1, C) 59 | fused_x = torch.cat([new_x, new_x], dim=-1) 60
| elif forward_type == '4x': 61 | new_x = x.reshape(B, H//4, 4, W//4, 4, C).permute(0, 1, 3, 2, 4, 5).reshape(B, H//4, W//4, 16, C) 62 | pooled_x = new_x.mean(-2, keepdim=True).expand(-1, -1, -1, 16, -1) 63 | fused_x = torch.cat([new_x, pooled_x], dim=-1) 64 | 65 | score = self.predictor(fused_x) 66 | normalized_score = F.softmax(score, dim=-2) 67 | new_x = (new_x * normalized_score).sum(dim=-2) 68 | return new_x 69 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/multimodal_resampler/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .perceiver import DynamicCompressor 4 | 5 | class IdentityMap(torch.nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def forward(self, x, *args, **kwargs): 10 | return x 11 | 12 | @property 13 | def config(self): 14 | return {"mm_resampler_type": None} 15 | 16 | def build_vision_resampler(model_args, delay_load=False, **kwargs): 17 | # import pdb;pdb.set_trace() 18 | resampler_type = getattr(model_args, 'mm_resampler_type', None) 19 | if resampler_type == 'dynamic_compressor': 20 | return DynamicCompressor(model_args, **kwargs) 21 | elif resampler_type is None: 22 | return IdentityMap() 23 | else: 24 | raise ValueError(f'Unknown resampler type: {resampler_type}') 25 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/multimodal_resampler/perceiver.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import torch.nn.functional as F 5 | import os 6 | 7 | class DynamicCompressor(nn.Module): 8 | def __init__(self, model_args, vision_tower): 9 | super().__init__() 10 | 11 | self.out_channels = vision_tower.hidden_size 12 | self.mid_channel = 256 13 | 14 | self.vlm_query_projector = nn.Linear(self.out_channels, self.mid_channel) 15 | self.vlm_key_projector = nn.Linear(self.out_channels, self.mid_channel) 16 | 17 | def downsample(self, x): 18 | return F.avg_pool2d(x, 2, 2) 19 | 20 | def downsample_4(self, x): 21 | return F.avg_pool2d(x, 4, 4) 22 | 23 | def forward(self, image_features, forward_type, image_size=None): 24 | if image_size is None: 25 | ori_W = int(math.sqrt(image_features.shape[1])) 26 | ori_H = int(ori_W) 27 | else: 28 | ori_H, ori_W = image_size 29 | T, N, C = image_features.shape 30 | image_features = image_features.view(T, ori_H, ori_W, C).permute(0, 3, 1, 2) # T, C, H, W 31 | 32 | if forward_type == 'video': 33 | image_features_pool = self.downsample(image_features) 34 | image_feature_attn = image_features.reshape(T, C, ori_H // 2, 2, ori_W // 2, 2).permute(0, 2, 4, 3, 5, 1).reshape(T, ori_H // 2 * ori_W // 2, 4, C) 35 | new_image_size = (ori_H // 2, ori_W // 2) 36 | elif forward_type == 'image' or forward_type == 'text': 37 | image_features_pool = image_features 38 | image_feature_attn = image_features.reshape(T, C, ori_H, 1, ori_W, 1).permute(0, 2, 4, 3, 5, 1).reshape(T, ori_H * ori_W, 1, C) 39 | new_image_size = (ori_H, ori_W) 40 | elif forward_type == 'video_long': 41 | image_features_pool = self.downsample_4(image_features) 42 | image_feature_attn = image_features.reshape(T, C, ori_H // 4, 4, ori_W // 4, 4).permute(0, 2, 4, 3, 5, 1).reshape(T, ori_H // 4 * ori_W // 4, 16, C) 43 | new_image_size = (ori_H // 4, ori_W // 4) 44 | else: 45 | raise NotImplementedError 46 | 47 | image_features_pool = image_features_pool.flatten(2).permute(0, 2, 1) # 
T, H*W, C 48 | new_t, new_p, _ = image_features_pool.shape 49 | 50 | image_query = self.vlm_query_projector(image_features_pool).reshape(new_t*new_p, self.mid_channel) 51 | image_key = self.vlm_key_projector(image_feature_attn).reshape(new_t*new_p, -1, self.mid_channel) 52 | 53 | image_value = image_feature_attn.reshape(new_t*new_p, -1, self.out_channels) 54 | # import pdb;pdb.set_trace() 55 | 56 | image_attn = image_query[:,None] @ (image_key.transpose(-1,-2) / (image_key.shape[-1]**0.5)) 57 | image_attn = image_attn.nan_to_num() 58 | attn_feat = (image_attn.softmax(-1) @ image_value).mean(1).reshape(new_t, new_p, C) 59 | 60 | image_features_pool = image_features_pool + attn_feat 61 | 62 | return image_features_pool, new_image_size 63 | 64 | @property 65 | def config(self): 66 | return { 67 | 'mm_resampler_type': 'dynamic_compressor', 68 | 'mm_out_channels': self.out_channels, 69 | } 70 | 71 | @property 72 | def hidden_size(self): 73 | return self.out_channels 74 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_encoder/beats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/vlm/ola/ola/model/speech_encoder/beats/__init__.py -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_encoder import WhisperWrappedEncoder, DualWrappedEncoder 2 | 3 | 4 | def build_speech_encoder(config): 5 | speech_encoder_type = getattr(config, 'speech_encoder_type', None) 6 | if "whisper" in speech_encoder_type.lower(): 7 | return WhisperWrappedEncoder.load(config) 8 | elif "dual" in speech_encoder_type.lower(): 9 | return DualWrappedEncoder(config) 10 | 11 | raise ValueError(f'Unknown speech encoder: {speech_encoder_type}') 12 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_encoder/speech_encoder.py: -------------------------------------------------------------------------------- 1 | import types 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from transformers import WhisperFeatureExtractor 6 | import whisper 7 | 8 | from ....ola.model.speech_encoder.beats.BEATs import BEATsConfig, BEATs 9 | 10 | class WhisperWrappedEncoder: 11 | 12 | @classmethod 13 | def load(cls, model_config): 14 | 15 | def replace_layer_norm(module): 16 | from whisper.model import LayerNorm 17 | for name, child in module.named_children(): 18 | if isinstance(child, LayerNorm): 19 | old_params = child.state_dict() 20 | new_layer_norm = nn.LayerNorm(child.normalized_shape, eps=child.eps, elementwise_affine=child.elementwise_affine) 21 | new_layer_norm.load_state_dict(old_params) 22 | setattr(module, name, new_layer_norm) 23 | else: 24 | replace_layer_norm(child) 25 | 26 | encoder = whisper.load_model(name=model_config.speech_encoder, device='cpu').encoder 27 | replace_layer_norm(encoder) 28 | return encoder 29 | 30 | class DualWrappedEncoder(nn.Module): 31 | def __init__(self, config): 32 | super().__init__() 33 | self.config = config 34 | self.whisper_model = self.load_whisper(config) 35 | self.beats_model = self.load_beats(config) 36 | 37 | def load_whisper(cls, model_config): 38 | 39 | def replace_layer_norm(module): 40 | from whisper.model import 
LayerNorm 41 | for name, child in module.named_children(): 42 | if isinstance(child, LayerNorm): 43 | old_params = child.state_dict() 44 | new_layer_norm = nn.LayerNorm(child.normalized_shape, eps=child.eps, elementwise_affine=child.elementwise_affine) 45 | new_layer_norm.load_state_dict(old_params) 46 | setattr(module, name, new_layer_norm) 47 | else: 48 | replace_layer_norm(child) 49 | 50 | encoder = whisper.load_model(name=model_config.speech_encoder, device='cpu').encoder 51 | replace_layer_norm(encoder) 52 | return encoder 53 | 54 | def load_beats(cls, model_config): 55 | beats_path = model_config.music_encoder 56 | print("Loading BEATs Model") 57 | beats_ckpt = torch.load(beats_path, map_location='cpu') 58 | beats_cfg = BEATsConfig(beats_ckpt['cfg']) 59 | beats = BEATs(beats_cfg) 60 | beats.load_state_dict(beats_ckpt['model']) 61 | return beats 62 | 63 | def forward(self, x, raw_wav=None, audio_padding_mask=None): 64 | with torch.no_grad(): 65 | self.beats_model = self.beats_model.float() 66 | speech_embeds = self.whisper_model(x.half()) 67 | audio_embeds, _ = self.beats_model.extract_features(raw_wav.float(), padding_mask=audio_padding_mask, feature_only=True) 68 | if audio_embeds.size(1) < speech_embeds.size(1): 69 | audio_embeds = F.pad(audio_embeds, (0, 0, 0, speech_embeds.size(1) - audio_embeds.size(1))) 70 | elif audio_embeds.size(1) > speech_embeds.size(1): 71 | speech_embeds = F.pad(speech_embeds, (0, 0, 0, audio_embeds.size(1) - speech_embeds.size(1))) 72 | speech_embeds = torch.cat((speech_embeds, audio_embeds), dim=-1) 73 | speech_embeds = speech_embeds.to(torch.bfloat16) 74 | return speech_embeds 75 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_projector/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_projector import EncoderProjectorConcat 2 | 3 | 4 | def build_speech_projector(config): 5 | projector_type = getattr(config, 'speech_projector_type', 'linear') 6 | if projector_type == 'linear': 7 | return EncoderProjectorConcat(config) 8 | 9 | raise ValueError(f'Unknown projector type: {projector_type}') 10 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_projector/speech_projector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | class EncoderProjectorConcat(nn.Module): 6 | def __init__(self, config): 7 | super().__init__() 8 | self.k = config.speech_encoder_ds_rate 9 | self.encoder_dim = config.speech_encoder_hidden_size 10 | self.llm_dim = config.hidden_size 11 | self.linear1 = nn.Linear(self.encoder_dim * self.k, 2048) 12 | self.relu = nn.ReLU() 13 | self.linear2 = nn.Linear(2048, config.hidden_size) 14 | 15 | embed_std = 1 / math.sqrt(config.hidden_size) 16 | self.speech_newline = nn.Parameter( 17 | torch.randn(config.hidden_size) * embed_std 18 | ) 19 | self.speech_begin = nn.Parameter( 20 | torch.randn(config.hidden_size) * embed_std 21 | ) 22 | self.speech_end = nn.Parameter( 23 | torch.randn(config.hidden_size) * embed_std 24 | ) 25 | 26 | def forward(self, x): 27 | batch_size, seq_len, dim = x.size() 28 | num_frames_to_discard = seq_len % self.k 29 | if num_frames_to_discard > 0: 30 | x = x[:, :-num_frames_to_discard, :] 31 | seq_len = x.size(1) 32 | 33 | x = x.contiguous() 34 | x = x.view(batch_size, seq_len // self.k, dim * self.k) 35 | x = self.linear1(x) 36 | x = 
self.relu(x) 37 | x = self.linear2(x) 38 | x = torch.cat([ 39 | x, 40 | self.speech_newline.reshape(1, 1, -1).expand(batch_size, 1, -1).to(x.dtype) 41 | ], dim=1) 42 | begin = self.speech_begin.reshape(1, -1).to(x.dtype) 43 | end = self.speech_end.reshape(1, -1).to(x.dtype) 44 | x = x.flatten(0, 1) 45 | x = torch.cat([begin, x, end], dim=0) 46 | # x = x.flatten(0, 1) 47 | return x 48 | -------------------------------------------------------------------------------- /vlmeval/vlm/ovis/__init__.py: -------------------------------------------------------------------------------- 1 | from .ovis import Ovis, Ovis1_6, Ovis1_6_Plus, Ovis2 2 | 3 | __all__ = ['Ovis', 'Ovis1_6', 'Ovis1_6_Plus', 'Ovis2'] 4 | -------------------------------------------------------------------------------- /vlmeval/vlm/ovis/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/vlm/ovis/utils/__init__.py -------------------------------------------------------------------------------- /vlmeval/vlm/pandagpt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import os.path as osp 4 | import warnings 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class PandaGPT(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, name, root=None, **kwargs): 15 | if root is None: 16 | raise ValueError('Please set `root` to PandaGPT code directory, which is cloned from here: ') 17 | 18 | assert name == 'PandaGPT_13B' 19 | self.name = name 20 | sys.path.append(osp.join(root, 'code')) 21 | try: 22 | from model.openllama import OpenLLAMAPEFTModel 23 | except Exception as e: 24 | logging.critical( 25 | 'Please first install PandaGPT and set the root path to use PandaGPT, ' 26 | 'which is cloned from here: https://github.com/yxuansu/PandaGPT. ' 27 | ) 28 | raise e 29 | 30 | self.args = { 31 | 'model': 'openllama_peft', 32 | 'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'), 33 | 'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'), 34 | 'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'), 35 | 'stage': 2, 36 | 'max_tgt_len': 512, 37 | 'lora_r': 32, 38 | 'lora_alpha': 32, 39 | 'lora_dropout': 0.1, 40 | } 41 | model = OpenLLAMAPEFTModel(**self.args) 42 | delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) 43 | model.load_state_dict(delta_ckpt, strict=False) 44 | torch.cuda.empty_cache() 45 | self.model = model.eval().half().cuda() 46 | kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} 47 | kwargs_default.update(kwargs) 48 | self.kwargs = kwargs_default 49 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 50 | 51 | def generate_inner(self, message, dataset=None): 52 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 53 | struct = { 54 | 'prompt': prompt, 55 | 'image_paths': [image_path], 56 | 'audio_paths': [], 57 | 'video_paths': [], 58 | 'thermal_paths': [], 59 | 'modality_embeds': [] 60 | } 61 | struct.update(self.kwargs) 62 | resp = self.model.generate(struct) 63 | return resp 64 | -------------------------------------------------------------------------------- /vlmeval/vlm/phi4_multimodal.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class Phi4Multimodal(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='microsoft/Phi-4-multimodal-instruct', **kwargs): 14 | try: 15 | from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig 16 | except Exception as e: 17 | logging.critical('Please install the latest version transformers.') 18 | raise e 19 | 20 | model = AutoModelForCausalLM.from_pretrained( 21 | model_path, device_map='cuda', trust_remote_code=True, 22 | torch_dtype='auto',attn_implementation='flash_attention_2' 23 | ).eval() 24 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 25 | generation_config = GenerationConfig.from_pretrained(model_path) 26 | 27 | self.model = model 28 | self.processor = processor 29 | # self.kwargs = kwargs 30 | self.generation_config = generation_config 31 | 32 | def generate_inner(self, message, dataset=None): 33 | user_question = '\n'.join([msg['value'] for msg in message if msg['type'] == 'text']) 34 | images = [Image.open(msg['value']).convert('RGB') for msg in message if msg['type'] == 'image'] 35 | 36 | user_prompt = '<|user|>' 37 | assistant_prompt = '<|assistant|>' 38 | prompt_suffix = '<|end|>' 39 | prompt = f'{user_prompt}<|image_1|>{user_question}{prompt_suffix}{assistant_prompt}' 40 | inputs = self.processor(text=prompt, images=images[0], return_tensors='pt').to('cuda') 41 | 42 | # Generate response 43 | generate_ids = self.model.generate( 44 | **inputs, 45 | max_new_tokens=1000, 46 | generation_config=self.generation_config, 47 | ) 48 | generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] 49 | response = self.processor.batch_decode( 50 | generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False 51 | )[0] 52 | return response 53 | -------------------------------------------------------------------------------- /vlmeval/vlm/pixtral.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from .base import BaseModel 4 | from ..smp import * 5 | import warnings 6 | from huggingface_hub import snapshot_download 7 | 8 | 9 | class Pixtral(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = True 13 | 14 | def __init__(self, model_path='mistralai/Pixtral-12B-2409', **kwargs): 15 | 16 | self.model_path = model_path 17 | try: 18 | from mistral_inference.transformer import Transformer 19 | from mistral_common.tokens.tokenizers.mistral import MistralTokenizer 20 | except ImportError as err: 21 | logging.critical('Please install `mistral-inference` and `mistral_common`') 22 | raise err 23 | 24 | if os.path.exists(model_path): 25 | cache_path = model_path 26 | else: 27 | if get_cache_path(model_path) is None: 28 | snapshot_download(repo_id=model_path) 29 | cache_path = 
get_cache_path(self.model_path, repo_type='models') 30 | 31 | self.tokenizer = MistralTokenizer.from_file(f'{cache_path}/tekken.json') 32 | model = Transformer.from_folder(cache_path, device='cpu') 33 | model.cuda() 34 | self.model = model 35 | self.max_tokens = 2048 36 | 37 | def generate_inner(self, message, dataset=None): 38 | try: 39 | from mistral_inference.generate import generate 40 | from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk 41 | from mistral_common.protocol.instruct.request import ChatCompletionRequest 42 | except ImportError as err: 43 | logging.critical('Please install `mistral-inference` and `mistral_common`') 44 | raise err 45 | 46 | msg_new = [] 47 | for msg in message: 48 | tp, val = msg['type'], msg['value'] 49 | if tp == 'text': 50 | msg_new.append(TextChunk(text=val)) 51 | elif tp == 'image': 52 | b64 = encode_image_file_to_base64(val) 53 | image_url = f'data:image/jpeg;base64,{b64}' 54 | msg_new.append(ImageURLChunk(image_url=image_url)) 55 | 56 | completion_request = ChatCompletionRequest(messages=[UserMessage(content=msg_new)]) 57 | encoded = self.tokenizer.encode_chat_completion(completion_request) 58 | images = encoded.images 59 | tokens = encoded.tokens 60 | 61 | out_tokens, _ = generate( 62 | [tokens], 63 | self.model, 64 | images=[images], 65 | max_tokens=self.max_tokens, 66 | temperature=0, 67 | eos_id=self.tokenizer.instruct_tokenizer.tokenizer.eos_id) 68 | 69 | result = self.tokenizer.decode(out_tokens[0]) 70 | return result 71 | -------------------------------------------------------------------------------- /vlmeval/vlm/qh_360vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import os.path as osp 5 | from PIL import Image 6 | from .base import BaseModel 7 | from ..smp import * 8 | from ..dataset import DATASET_TYPE 9 | 10 | 11 | class QH_360VL(BaseModel): 12 | 13 | INSTALL_REQ = False 14 | INTERLEAVE = False 15 | 16 | def __init__(self, model_path='qihoo360/360VL-70B', **kwargs): 17 | assert model_path is not None 18 | self.model_path = model_path 19 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 20 | self.model = AutoModelForCausalLM.from_pretrained(model_path, 21 | torch_dtype=torch.float16, 22 | low_cpu_mem_usage=True, 23 | device_map='auto', 24 | trust_remote_code=True).eval() 25 | vision_tower = self.model.get_vision_tower() 26 | vision_tower.load_model() 27 | vision_tower.to(device='cuda', dtype=torch.float16) 28 | self.image_processor = vision_tower.image_processor 29 | self.tokenizer.pad_token = self.tokenizer.eos_token 30 | self.kwargs = kwargs 31 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 32 | torch.cuda.empty_cache() 33 | 34 | def generate(self, message, dataset=None): 35 | 36 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 37 | print(prompt) 38 | image = Image.open(image_path).convert('RGB') 39 | terminators = [ 40 | self.tokenizer.convert_tokens_to_ids('<|eot_id|>',) 41 | ] 42 | inputs = self.model.build_conversation_input_ids(self.tokenizer, 43 | query=prompt, 44 | image=image, 45 | image_processor=self.image_processor) 46 | input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True) 47 | images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True) 48 | 49 | output_ids = self.model.generate(input_ids=input_ids, 50 | images=images, 51 | do_sample=False, 52 | num_beams=1, 53 | max_new_tokens=512, 54 | eos_token_id=terminators, 55 | use_cache=True) 56 | 57 | input_token_len = input_ids.shape[1] 58 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 59 | response = outputs.strip() 60 | 61 | return response 62 | -------------------------------------------------------------------------------- /vlmeval/vlm/qwen2_vl/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import Qwen2VLChat, Qwen2VLChatAguvis 2 | from .prompt import Qwen2VLPromptMixin 3 | -------------------------------------------------------------------------------- /vlmeval/vlm/slime.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from abc import abstractproperty 4 | import sys 5 | import os.path as osp 6 | from .base import BaseModel 7 | from ..smp import * 8 | from ..dataset import DATASET_TYPE 9 | import copy 10 | 11 | 12 | class SliME(BaseModel): 13 | 14 | INSTALL_REQ = True 15 | INTERLEAVE = True 16 | 17 | DEFAULT_IMAGE_TOKEN = '' 18 | IMAGE_TOKEN_INDEX = -200 19 | 20 | def __init__(self, model_path='yifanzhang114/SliME-Llama3-8B', **kwargs): 21 | assert model_path is not None 22 | try: 23 | from llava.model.builder import load_pretrained_model 24 | from llava.conversation import conv_templates 25 | from llava.mm_utils import get_model_name_from_path, tokenizer_image_token 26 | except Exception as err: 27 | logging.critical('Please install requirements on https://github.com/yfzhang114/SliME before using SliME') 28 | raise err 29 | 30 | model_name = get_model_name_from_path(model_path) 31 | tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, model_name, device_map=None) 32 | model.cuda().eval() 33 | model.tie_weights() 34 | 35 | if 'llama3' in model_path.lower(): 36 | conv_mode = 'llama3' 37 | elif 'vicuna' in model_path.lower(): 38 | conv_mode = 'v1' 39 | self.conv_template = conv_mode 40 | self.conv_templates = conv_templates 41 | self.tokenizer = tokenizer 42 | self.model = model 43 | self.image_processor = image_processor 44 | self.tokenizer_image_token = tokenizer_image_token 45 | 46 | def generate_inner(self, message, dataset=None): 47 | content, images = '', [] 48 | for msg in message: 49 | if msg['type'] == 'text': 50 | content += msg['value'] 51 | else: 52 | images.append(Image.open(msg['value']).convert('RGB')) 53 | content += (self.DEFAULT_IMAGE_TOKEN + '\n') 54 | 55 | preprocess = self.image_processor.preprocess 56 | image_tokenizer = self.tokenizer_image_token 57 | image_tensor = [ 58 | preprocess(f, return_tensors='pt')['pixel_values'][0].half().cuda() for f in images 59 | ] 60 | image_tensor = torch.stack(image_tensor) 
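# Next: build the conversation prompt from the selected template, then tokenize it with
# tokenizer_image_token so each image placeholder is mapped to IMAGE_TOKEN_INDEX (-200).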
61 | 62 | conv = copy.deepcopy(self.conv_templates[self.conv_template]) 63 | conv.messages = list(conv.messages) 64 | conv.append_message(conv.roles[0], content) 65 | conv.append_message(conv.roles[1], None) 66 | prompt_question = conv.get_prompt() 67 | 68 | input_ids = image_tokenizer(prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors='pt') 69 | input_ids = input_ids.unsqueeze(0).cuda() 70 | 71 | cont = self.model.generate( 72 | input_ids, 73 | images=image_tensor, 74 | do_sample=False, 75 | temperature=0, 76 | max_new_tokens=512, 77 | ) 78 | text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0] 79 | return text_outputs 80 | -------------------------------------------------------------------------------- /vlmeval/vlm/ursa/__init__.py: -------------------------------------------------------------------------------- 1 | from .ursa_chat import UrsaChat 2 | -------------------------------------------------------------------------------- /vlmeval/vlm/ursa/ursa_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from .image_processing_vlm import VLMImageProcessor, VLMImageProcessorConfig 15 | from .modeling_ursa import UrsaForConditionalGeneration, UrsaForTokenClassification 16 | from .processing_ursa import UrsaProcessor 17 | from .configuration_ursa import VisionConfig, UrsaConfig, AlignerConfig 18 | from .projector import MlpProjector 19 | 20 | __all__ = [ 21 | "VLMImageProcessor", 22 | "UrsaProcessor", 23 | "UrsaForConditionalGeneration", 24 | "UrsaForTokenClassification", 25 | "VLMImageProcessorConfig", 26 | "VisionConfig", 27 | "MlpProjector", 28 | "AlignerConfig", 29 | "UrsaConfig" 30 | ] 31 | -------------------------------------------------------------------------------- /vlmeval/vlm/ursa/ursa_model/processing_ursa.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
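# UrsaProcessor below pairs an AutoImageProcessor with an AutoTokenizer behind a single
# __call__ that merges their outputs into one BatchFeature, following the ProcessorMixin pattern.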
13 | 14 | from typing import List, Optional, Union 15 | 16 | from transformers.feature_extraction_utils import BatchFeature 17 | from transformers.image_utils import ImageInput 18 | from transformers.processing_utils import ProcessorMixin 19 | from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy 20 | from transformers.utils import TensorType 21 | 22 | 23 | class UrsaProcessor(ProcessorMixin): 24 | attributes = ["image_processor", "tokenizer"] 25 | valid_kwargs = ["chat_template"] 26 | image_processor_class = "AutoImageProcessor" 27 | tokenizer_class = "AutoTokenizer" 28 | 29 | def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): 30 | super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs) 31 | 32 | def __call__( 33 | self, 34 | text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, 35 | images: ImageInput = None, 36 | padding: Union[bool, str, PaddingStrategy] = False, 37 | truncation: Union[bool, str, TruncationStrategy] = None, 38 | max_length=None, 39 | return_tensors: Optional[Union[str, TensorType]] = None, # or TensorType.PYTORCH 40 | ) -> BatchFeature: 41 | image_inputs = {} 42 | if images is not None: 43 | image_inputs = self.image_processor(images, return_tensors=return_tensors) 44 | text_inputs = self.tokenizer( 45 | text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length 46 | ) 47 | return BatchFeature(data={**text_inputs, **image_inputs}) 48 | 49 | def decode(self, *args, **kwargs): 50 | return self.tokenizer.decode(*args, **kwargs) 51 | 52 | def batch_decode(self, *args, **kwargs): 53 | return self.tokenizer.batch_decode(*args, **kwargs) 54 | 55 | @property 56 | # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names 57 | def model_input_names(self): 58 | tokenizer_input_names = self.tokenizer.model_input_names 59 | image_processor_input_names = self.image_processor.model_input_names 60 | return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) 61 | -------------------------------------------------------------------------------- /vlmeval/vlm/valley/__init__.py: -------------------------------------------------------------------------------- 1 | from .valley import Valley2Chat 2 | -------------------------------------------------------------------------------- /vlmeval/vlm/valley/requirements_valley.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.34.2 2 | bert-score==0.3.13 3 | byted-wandb==0.13.72 4 | datasets==2.21.0 5 | einops==0.8.0 6 | evaluate==0.4.3 7 | fastapi==0.115.0 8 | flash_attn 9 | ftfy==6.2.3 10 | markdown2==2.5.0 11 | ninja==1.11.1.1 12 | nltk==3.9.1 13 | numpy==1.26.4 14 | omegaconf==2.3.0 15 | openai==0.28 16 | opencv-python-headless==4.10.0.84 17 | packaging==24.1 18 | pandas==2.2.2 19 | peft==0.5.0 20 | prettytable==3.11.0 21 | protobuf==3.20.3 22 | pyarrow==15.0.0 23 | pydantic==1.10.14 24 | qwen_vl_utils 25 | requests==2.32.3 26 | rouge-score==0.1.2 27 | scikit-image==0.24.0 28 | scikit-learn==1.5.2 29 | sentencepiece==0.1.97 30 | timm==0.6.7 31 | tokenizers>=0.13.3 32 | torchmetrics 33 | transformers==4.45.2 34 | uvicorn==0.30.6 35 | -------------------------------------------------------------------------------- /vlmeval/vlm/video_llm/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.video_llava import VideoLLaVA, VideoLLaVA_HF 2 | from .videochat2 import VideoChat2_HD 3 | from .chat_uni_vi import Chatunivi 4 | from .video_chatgpt import VideoChatGPT 5 | from .llama_vid import LLaMAVID 6 | from .pllava import PLLaVA 7 | 8 | __all__ = ['VideoLLaVA', 'VideoLLaVA_HF', 'Chatunivi', 'VideoChatGPT', 'LLaMAVID', 'VideoChat2_HD', 'PLLaVA'] 9 | -------------------------------------------------------------------------------- /vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": 224, 3 | "do_center_crop": true, 4 | "do_normalize": true, 5 | "do_resize": true, 6 | "feature_extractor_type": "CLIPFeatureExtractor", 7 | "image_mean": [ 8 | 0.48145466, 9 | 0.4578275, 10 | 0.40821073 11 | ], 12 | "image_std": [ 13 | 0.26862954, 14 | 0.26130258, 15 | 0.27577711 16 | ], 17 | "resample": 3, 18 | "size": 224 19 | } 20 | -------------------------------------------------------------------------------- /vlmeval/vlm/video_llm/configs/videochat2_hd.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "model_cls": "VideoChat2_it_hd_mistral", 4 | "vit_blip_model_path": "OpenGVLab/videochat2", 5 | "mistral_model_path": "mistralai/Mistral-7B-Instruct-v0.2", 6 | "videochat2_model_path": "OpenGVLab/VideoChat2_stage2_Mistral_7B", 7 | "freeze_vit": false, 8 | "freeze_qformer": false, 9 | "max_txt_len": 512, 10 | "low_resource": false, 11 | "vision_encoder": { 12 | "name": "vit_l14", 13 | "img_size": 224, 14 | "patch_size": 16, 15 | "d_model": 1024, 16 | "encoder_embed_dim": 1024, 17 | "encoder_depth": 24, 18 | "encoder_num_heads": 16, 19 | "drop_path_rate": 0.0, 20 | "num_frames": 8, 21 | "tubelet_size": 1, 22 | "use_checkpoint": true, 23 | "checkpoint_num": 18, 24 | "pretrained": "", 25 | "return_index": -2, 26 | "vit_add_ln": true, 27 | "ckpt_num_frame": 4 28 | }, 29 | "num_query_token": 32, 30 | "qformer_hidden_dropout_prob": 0.1, 31 | "qformer_attention_probs_dropout_prob": 0.1, 32 | "qformer_drop_path_rate": 0.2, 33 | "extra_num_query_token": 64, 34 | "qformer_text_input": true, 35 | "system": "", 36 | "start_token": "", 38 | "add_second_msg": true, 39 | "img_start_token": "", 40 | "img_end_token": "", 41 | "random_shuffle": true, 42 | "return_question_instruction": false, 43 | "use_flash_attention": true, 44 | "use_lora": false, 45 | "lora_r": 16, 46 | "lora_alpha": 32, 47 | "lora_dropout": 0.1, 48 | "dynamic_config": { 49 | "local_size": 224, 50 | "hd_num": 6, 51 | "padding": false, 52 | "add_global": true 53 | } 54 | }, 55 | "device": "cuda" 56 | } 57 | -------------------------------------------------------------------------------- /vlmeval/vlm/video_llm/video_chatgpt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import warnings 4 | import copy as cp 5 | import numpy as np 6 | import sys 7 | import logging 8 | from ..base import BaseModel 9 | from ...smp import isimg, listinstr 10 | from ...dataset import DATASET_TYPE 11 | from huggingface_hub import snapshot_download 12 | 13 | 14 | class VideoChatGPT(BaseModel): 15 | INSTALL_REQ = True 16 | INTERLEAVE = False 17 | VIDEO_LLM = True 18 | # sample a video in 100 frames 19 | 20 | def __init__(self, model_path='MBZUAI/Video-ChatGPT-7B', dir_root=None, **kwargs): 21 | assert model_path is not None 22 | sys.path.append(dir_root) 23 | try: 24 | from 
video_chatgpt.eval.model_utils import initialize_model 25 | except Exception as err: 26 | logging.critical( 27 | 'Please first install requirements and set the root path to use Video-ChatGPT. \ 28 | Follow the instructions at https://github.com/mbzuai-oryx/Video-ChatGPT.' 29 | ) 30 | raise err 31 | base_model_path = snapshot_download('mmaaz60/LLaVA-7B-Lightening-v1-1') 32 | projection_path = snapshot_download(model_path) 33 | projection_name = 'video_chatgpt-7B.bin' 34 | projection_path = os.path.join(projection_path, projection_name) 35 | 36 | model, vision_tower, tokenizer, image_processor, video_token_len = initialize_model( 37 | base_model_path, projection_path 38 | ) 39 | self.tokenizer = tokenizer 40 | self.model = model 41 | self.processor = image_processor 42 | self.context_len = video_token_len 43 | self.kwargs = kwargs 44 | self.vision_tower = vision_tower 45 | 46 | def get_model_output(self, model, video_processor, tokenizer, video, qs): 47 | from video_chatgpt.eval.model_utils import load_video 48 | from video_chatgpt.inference import video_chatgpt_infer 49 | conv_mode = 'video-chatgpt_v1' 50 | 51 | video_frames = load_video(video) 52 | # Run inference on the video and questions 53 | output = video_chatgpt_infer( 54 | video_frames, 55 | qs, 56 | conv_mode, 57 | model, 58 | self.vision_tower, 59 | tokenizer, 60 | video_processor, 61 | self.context_len, 62 | ) 63 | return output 64 | 65 | def generate_inner(self, message, dataset=None): 66 | question, video = self.message_to_promptvideo(message) 67 | response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question) 68 | return response 69 | -------------------------------------------------------------------------------- /vlmeval/vlm/visualglm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .base import BaseModel 3 | from ..smp import * 4 | 5 | 6 | class VisualGLM(BaseModel): 7 | 8 | INSTALL_REQ = False 9 | INTERLEAVE = False 10 | 11 | def __init__(self, model_path='THUDM/visualglm-6b', **kwargs): 12 | try: 13 | import sat 14 | except Exception as err: 15 | logging.critical('Please install SwissArmyTransformer to use VisualGLM') 16 | raise err 17 | 18 | assert model_path is not None 19 | self.model_path = model_path 20 | 21 | from transformers import AutoModel 22 | from transformers import AutoTokenizer 23 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 24 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda() 25 | self.model = model 26 | self.kwargs = kwargs 27 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 28 | 29 | def generate_inner(self, message, dataset=None): 30 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 31 | output, _ = self.model.chat( 32 | image_path=image_path, 33 | tokenizer=self.tokenizer, 34 | query=prompt, 35 | history=[], 36 | **self.kwargs 37 | ) 38 | return output 39 | -------------------------------------------------------------------------------- /vlmeval/vlm/wemm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import sys 4 | from ..smp import * 5 | from .base import BaseModel 6 | from ..dataset import DATASET_TYPE 7 | from transformers import AutoModel, GenerationConfig 8 | 9 | 10 | class WeMM(BaseModel): 11 | def __init__(self, model_path='feipengma/WeMM', **kwargs): 12 | self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True) 13 | self.wemm.cuda() 14 | self.wemm.eval() 15 | torch.cuda.empty_cache() 16 | 17 | def use_custom_prompt(self, dataset): 18 | assert dataset is not None 19 | if DATASET_TYPE(dataset) == 'MCQ': 20 | return True 21 | return False 22 | 23 | def build_prompt(self, line, dataset=None): 24 | assert self.use_custom_prompt(dataset) 25 | assert dataset is None or isinstance(dataset, str) 26 | tgt_path = self.dump_image(line, dataset) 27 | question = line['question'] 28 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 29 | if hint is not None: 30 | question = hint + '\n' + question 31 | options = { 32 | cand: line[cand] 33 | for cand in string.ascii_uppercase 34 | if cand in line and not pd.isna(line[cand]) 35 | } 36 | for key, item in options.items(): 37 | question += f'\n{key}. {item}' 38 | prompt = question 39 | 40 | if len(options): 41 | prompt += ( 42 | '\n请直接回答选项字母。' if cn_string(prompt) else 43 | "\nAnswer with the option's letter from the given choices directly." 44 | ) 45 | else: 46 | prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' 47 | 48 | message = [dict(type='text', value=prompt)] 49 | message.extend([dict(type='image', value=p) for p in tgt_path]) 50 | return message 51 | 52 | def generate_inner(self, message, dataset=None): 53 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 54 | 55 | if dataset == 'HallusionBench': 56 | prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.' 
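# Only MMVet gets a dedicated GenerationConfig below; every other dataset passes
# gen_config=None to wemm.mm_generate.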
57 | 58 | gen_config = None 59 | if dataset == 'MMVet': 60 | gen_config = GenerationConfig( 61 | max_new_tokens=512, 62 | do_sample=True, 63 | temperatures=0.7, 64 | num_beams=3, 65 | eos_token_id=self.wemm.tokenizer.eos_token_id, 66 | pad_token_id=self.wemm.tokenizer.pad_token_id 67 | if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id, 68 | ) 69 | pred = self.wemm.mm_generate(image_path, prompt, gen_config) 70 | 71 | return pred 72 | -------------------------------------------------------------------------------- /vlmeval/vlm/xcomposer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sharecaptioner import ShareCaptioner 2 | from .xcomposer import XComposer 3 | from .xcomposer2 import XComposer2 4 | from .xcomposer2_4KHD import XComposer2_4KHD 5 | from .xcomposer2d5 import XComposer2d5 6 | 7 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD', 'XComposer2d5'] 8 | -------------------------------------------------------------------------------- /vlmeval/vlm/xgen_mm.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class XGenMM(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = True 12 | 13 | def __init__(self, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5', **kwargs): 14 | try: 15 | from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor 16 | except Exception as err: 17 | logging.critical('Please install the latest version transformers.') 18 | raise err 19 | 20 | model = AutoModelForVision2Seq.from_pretrained( 21 | model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto' 22 | ).eval() 23 | 24 | tokenizer = AutoTokenizer.from_pretrained( 25 | model_path, trust_remote_code=True, use_fast=False, legacy=False 26 | ) 27 | tokenizer = model.update_special_tokens(tokenizer) 28 | tokenizer.eos_token = '<|end|>' 29 | tokenizer.padding_side = 'left' 30 | image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True) 31 | self.model = model 32 | self.image_processor = image_processor 33 | self.tokenizer = tokenizer 34 | self.kwargs = kwargs 35 | 36 | def apply_prompt_template(self, query): 37 | s = ( 38 | '<|system|>\nA chat between a curious user and an artificial intelligence assistant. 
' 39 | "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n" 40 | f'<|user|>\n{query}<|end|>\n<|assistant|>\n' 41 | ) 42 | return s 43 | 44 | def generate_inner(self, message, dataset=None): 45 | 46 | content, images, image_sizes = '', [], [] 47 | 48 | for msg in message: 49 | if msg['type'] == 'text': 50 | content += msg['value'] 51 | elif msg['type'] == 'image': 52 | image = Image.open(msg['value']).convert('RGB') 53 | images.append(self.image_processor([image], image_aspect_ratio='anyres')['pixel_values'].to('cuda')) 54 | image_sizes.append(image.size) 55 | content += ' ' 56 | 57 | inputs = {'pixel_values': [images]} 58 | prompt = self.apply_prompt_template(content) 59 | language_inputs = self.tokenizer([prompt], return_tensors='pt').to('cuda') 60 | inputs.update(language_inputs) 61 | 62 | generation_args = { 63 | 'max_new_tokens': 1024, 64 | 'temperature': 0.0, 65 | 'do_sample': False, 66 | 'top_p': None, 67 | 'num_beams': 1 68 | } 69 | generation_args.update(self.kwargs) 70 | 71 | generate_ids = self.model.generate( 72 | **inputs, image_size=[image_sizes], 73 | pad_token_id=self.tokenizer.pad_token_id, 74 | eos_token_id=self.tokenizer.eos_token_id, 75 | **generation_args 76 | ) 77 | 78 | # remove input tokens 79 | response = self.tokenizer.decode(generate_ids[0], skip_special_tokens=True).split('<|end|>')[0] 80 | 81 | return response 82 | --------------------------------------------------------------------------------