├── .github ├── scripts │ └── assert_score.py └── workflows │ ├── lint.yml │ └── pr-run-test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets ├── LOGO.svg └── apple.jpg ├── docs ├── en │ ├── .readthedocs.yaml │ ├── ConfigSystem.md │ ├── Contributors.md │ ├── Development.md │ ├── EvalByLMDeploy.md │ ├── Makefile │ ├── Quickstart.md │ ├── _static │ │ ├── css │ │ │ └── readthedocs.css │ │ ├── image │ │ │ ├── logo.svg │ │ │ └── logo_icon.svg │ │ └── js │ │ │ └── custom.js │ ├── _templates │ │ ├── 404.html │ │ ├── autosummary │ │ │ └── class.rst │ │ └── callable.rst │ ├── conf.py │ ├── docutils.conf │ └── index.rst ├── ja │ └── README_ja.md └── zh-CN │ ├── .readthedocs.yaml │ ├── ConfigSystem.md │ ├── Development.md │ ├── EvalByLMDeploy.md │ ├── Makefile │ ├── Quickstart.md │ ├── README_zh-CN.md │ ├── _static │ ├── css │ │ └── readthedocs.css │ ├── image │ │ ├── logo.svg │ │ └── logo_icon.svg │ └── js │ │ └── custom.js │ ├── _templates │ ├── 404.html │ ├── autosummary │ │ └── class.rst │ └── callable.rst │ ├── conf.py │ ├── cp_origin_docs.sh │ ├── docutils.conf │ └── index.rst ├── requirements.txt ├── requirements └── docs.txt ├── run.py ├── scripts ├── AI2D_preproc.ipynb ├── apires_scan.py ├── auto_run.py ├── cover.sh ├── data_browser.py ├── mmb_eval_gradio.py ├── run.sh ├── srun.sh ├── summarize.py └── visualize.ipynb ├── setup.py └── vlmeval ├── __init__.py ├── api ├── __init__.py ├── bailingmm.py ├── base.py ├── bluelm_v_api.py ├── claude.py ├── cloudwalk.py ├── doubao_vl_api.py ├── gemini.py ├── glm_vision.py ├── gpt.py ├── hf_chat_model.py ├── hunyuan.py ├── jt_vl_chat.py ├── lmdeploy.py ├── mug_u.py ├── qwen_api.py ├── qwen_vl_api.py ├── reka.py ├── sensechat_vision.py ├── siliconflow.py ├── stepai.py ├── taichu.py └── taiyi.py ├── config.py ├── dataset ├── GUI │ ├── __init__.py │ ├── screenspot.py │ ├── screenspot_pro.py │ └── screenspot_v2.py ├── Omnidocbench │ ├── __init__.py │ ├── data_preprocess.py │ ├── metrics.py │ ├── omnidocbench.py │ ├── requirements.txt │ └── utils.py ├── __init__.py ├── cgbench.py ├── charxiv.py ├── cmmmu.py ├── creation.py ├── dude.py ├── dynamath.py ├── emma.py ├── image_base.py ├── image_caption.py ├── image_ccocr.py ├── image_mcq.py ├── image_mt.py ├── image_shortqa.py ├── image_vqa.py ├── image_yorn.py ├── longvideobench.py ├── megabench.py ├── miabench.py ├── mlvu.py ├── mmalignbench.py ├── mmbench_video.py ├── mmgenbench.py ├── mmifeval.py ├── mmlongbench.py ├── mmmath.py ├── moat.py ├── moviechat1k.py ├── mvbench.py ├── qbench_video.py ├── slidevqa.py ├── spatial457.py ├── tamperbench.py ├── tempcompass.py ├── text_base.py ├── text_mcq.py ├── utils │ ├── __init__.py │ ├── ccocr_evaluator │ │ ├── README.md │ │ ├── __init__.py │ │ ├── common.py │ │ ├── doc_parsing_evaluator.py │ │ ├── kie_evaluator.py │ │ └── ocr_evaluator.py │ ├── cgbench.py │ ├── crpe.py │ ├── hrbench.py │ ├── judge_util.py │ ├── llavabench.py │ ├── logicvista.py │ ├── longvideobench.py │ ├── mathv.py │ ├── mathverse.py │ ├── mathvista.py │ ├── megabench │ │ ├── README.md │ │ ├── __init__.py │ │ ├── aggregation │ │ │ ├── mean_agg.py │ │ │ ├── min_agg.py │ │ │ └── unsupported_agg.py │ │ ├── aggregation_type.py │ │ ├── evaluator.py │ │ ├── metric_type.py │ │ ├── parsing │ │ │ ├── answer_str_parse.py │ │ │ ├── common │ │ │ │ ├── parsers.py │ │ │ │ └── utils.py │ │ │ ├── dummy_parse.py │ │ │ └── json_parse.py │ │ ├── requirements.txt │ │ ├── response_parse_type.py │ │ ├── scoring │ │ │ ├── ascii_art_gpt4o_judge.py │ │ │ ├── chess_jaccard.py │ │ │ ├── 
common │ │ │ │ ├── conversions.py │ │ │ │ ├── metrics.py │ │ │ │ └── transformations.py │ │ │ ├── constrained_generation.py │ │ │ ├── coordinate_sequence_match.py │ │ │ ├── dict_equality.py │ │ │ ├── dict_exact_match_agg_recall.py │ │ │ ├── dict_jaccard_agg_jaccard.py │ │ │ ├── dict_nbbox_iou_tuple_agg_jaccard.py │ │ │ ├── dict_set_equality_agg_jaccard.py │ │ │ ├── exact_str_match.py │ │ │ ├── exact_str_match_case_insensitive.py │ │ │ ├── general_numerical_match.py │ │ │ ├── geo_proximity.py │ │ │ ├── gleu.py │ │ │ ├── jaccard.py │ │ │ ├── latex_expr_equality.py │ │ │ ├── longest_common_list_prefix_ratio.py │ │ │ ├── mse.py │ │ │ ├── multi_ref_phrase.py │ │ │ ├── nbbox_iou.py │ │ │ ├── near_str_match.py │ │ │ ├── nli_entailment.py │ │ │ ├── normalized_similarity_damerau_levenshtein.py │ │ │ ├── number_rel_diff_ratio.py │ │ │ ├── positive_int_match.py │ │ │ ├── program_judge.py │ │ │ ├── sacrebleu_bleu.py │ │ │ ├── sequence_equality.py │ │ │ ├── set_equality.py │ │ │ ├── set_precision.py │ │ │ ├── simple_str_match.py │ │ │ ├── symbolic_planning.py │ │ │ ├── unsupported_scoring.py │ │ │ ├── vlm_as_judge.py │ │ │ ├── xml_nbbox_iou.py │ │ │ ├── xml_norm_point_distance.py │ │ │ └── xml_norm_point_in_bbox.py │ │ └── utils.py │ ├── mlvu.py │ ├── mmbench_video.py │ ├── mmdu.py │ ├── mmif │ │ ├── __init__.py │ │ └── function_and_compare.py │ ├── mmniah.py │ ├── mmsci.py │ ├── mmsci4eval_req.txt │ ├── mmvet.py │ ├── moviechat1k.py │ ├── multiple_choice.py │ ├── mvbench.py │ ├── naturalbench.py │ ├── ocr_reasoning.py │ ├── ocrbench.py │ ├── olympiadbench.py │ ├── physic.py │ ├── physics_eval_utils.py │ ├── phyx.py │ ├── qbench_video.py │ ├── qspatial.py │ ├── shortqa.py │ ├── spatial457.py │ ├── tablevqabench.py │ ├── tamperbench.py │ ├── tdbench.py │ ├── tempcompass.py │ ├── vdc.py │ ├── vgrpbench │ │ ├── __init__.py │ │ ├── configs │ │ │ └── formating-prompt │ │ │ │ ├── aquarium │ │ │ │ └── filter_prompt.json │ │ │ │ ├── battleships │ │ │ │ └── filter_prompt.json │ │ │ │ ├── binairo │ │ │ │ └── filter_prompt.json │ │ │ │ ├── coloredsudoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── fieldexplore │ │ │ │ └── filter_prompt.json │ │ │ │ ├── futoshiki │ │ │ │ └── filter_prompt.json │ │ │ │ ├── hitori │ │ │ │ └── filter_prompt.json │ │ │ │ ├── jigsawsudoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── kakurasu │ │ │ │ └── filter_prompt.json │ │ │ │ ├── kakuro │ │ │ │ └── filter_prompt.json │ │ │ │ ├── killersudoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── lightup │ │ │ │ └── filter_prompt.json │ │ │ │ ├── nonogram │ │ │ │ └── filter_prompt.json │ │ │ │ ├── oddevensudoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── renzoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── skyscraper │ │ │ │ └── filter_prompt.json │ │ │ │ ├── starbattle │ │ │ │ └── filter_prompt.json │ │ │ │ ├── sudoku │ │ │ │ └── filter_prompt.json │ │ │ │ ├── thermometers │ │ │ │ └── filter_prompt.json │ │ │ │ └── treesandtents │ │ │ │ └── filter_prompt.json │ │ ├── evaluation.py │ │ ├── puzzles │ │ │ ├── aquarium.py │ │ │ ├── battleships.py │ │ │ ├── binairo.py │ │ │ ├── coloredsudoku.py │ │ │ ├── common_constriants.py │ │ │ ├── common_get_game_factory.py │ │ │ ├── common_get_prompt.py │ │ │ ├── common_puzzle_factory.py │ │ │ ├── fieldexplore.py │ │ │ ├── futoshiki.py │ │ │ ├── hitori.py │ │ │ ├── jigsawsudoku.py │ │ │ ├── kakurasu.py │ │ │ ├── kakuro.py │ │ │ ├── killersudoku.py │ │ │ ├── lightup.py │ │ │ ├── nonogram.py │ │ │ ├── oddevensudoku.py │ │ │ ├── renzoku.py │ │ │ ├── skyscraper.py │ │ │ ├── starbattle.py │ │ │ ├── sudoku.py │ │ │ ├── 
thermometers.py │ │ │ └── treesandtents.py │ │ └── score.py │ ├── video_mmlu.py │ ├── videomme.py │ ├── visulogic.py │ ├── vlm2bench.py │ ├── vmcbench.py │ ├── vqa_eval.py │ ├── wemath.py │ ├── worldsense.py │ └── yorn.py ├── vcr.py ├── vdc.py ├── video_base.py ├── video_concat_dataset.py ├── video_dataset_config.py ├── video_mmlu.py ├── videomme.py ├── vl_rewardbench.py ├── vlm2bench.py ├── wildvision.py └── worldsense.py ├── inference.py ├── inference_mt.py ├── inference_video.py ├── smp ├── __init__.py ├── file.py ├── log.py ├── misc.py └── vlm.py ├── tools.py ├── utils ├── __init__.py ├── matching_util.py ├── mp_util.py └── result_transfer.py └── vlm ├── __init__.py ├── aki.py ├── aria.py ├── base.py ├── bunnyllama3.py ├── cambrian.py ├── chameleon.py ├── cogvlm.py ├── deepseek_vl.py ├── deepseek_vl2.py ├── eagle_x.py ├── emu.py ├── falcon_vlm.py ├── flash_vl.py ├── gemma.py ├── h2ovl_mississippi.py ├── idefics.py ├── instructblip.py ├── internvl ├── __init__.py ├── gui_template.yaml ├── internvl_chat.py └── utils.py ├── janus.py ├── kimi_vl.py ├── kosmos.py ├── llama4.py ├── llama_vision.py ├── llava ├── __init__.py ├── llava.py └── llava_xtuner.py ├── long_vita.py ├── mantis.py ├── mgm.py ├── minicpm_v.py ├── minigpt4.py ├── minimonkey.py ├── misc ├── blip2_instruct_vicuna13b.yaml ├── blip2_instruct_vicuna7b.yaml ├── minigpt4_13b_eval.yaml ├── minigpt4_7b_eval.yaml └── minigptv2_eval.yaml ├── mixsense.py ├── mmalaya.py ├── molmo.py ├── monkey.py ├── moondream.py ├── mplug_owl2.py ├── mplug_owl3.py ├── nvlm.py ├── ola ├── __init__.py ├── ola │ ├── arguments.py │ ├── constants.py │ ├── conversation.py │ ├── datasets │ │ ├── __init__.py │ │ └── preprocess.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── language_model │ │ │ └── ola_qwen.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── oryx_vit.py │ │ ├── multimodal_projector │ │ │ ├── builder.py │ │ │ └── pooler_projector.py │ │ ├── multimodal_resampler │ │ │ ├── builder.py │ │ │ └── perceiver.py │ │ ├── ola_arch.py │ │ ├── speech_encoder │ │ │ ├── beats │ │ │ │ ├── BEATs.py │ │ │ │ ├── Tokenizers.py │ │ │ │ ├── __init__.py │ │ │ │ ├── backbone.py │ │ │ │ ├── kaldi.py │ │ │ │ ├── modules.py │ │ │ │ └── quantizer.py │ │ │ ├── builder.py │ │ │ └── speech_encoder.py │ │ └── speech_projector │ │ │ ├── builder.py │ │ │ └── speech_projector.py │ └── utils.py └── ola_model.py ├── omchat.py ├── omnilmm.py ├── open_flamingo.py ├── ovis ├── __init__.py ├── ovis.py └── utils │ ├── __init__.py │ └── mdp3.py ├── pandagpt.py ├── parrot.py ├── phi3_vision.py ├── phi4_multimodal.py ├── pixtral.py ├── points.py ├── qh_360vl.py ├── qwen2_vl ├── __init__.py ├── model.py └── prompt.py ├── qwen_vl.py ├── rbdash.py ├── ristretto.py ├── ross.py ├── sail_vl.py ├── slime.py ├── smolvlm.py ├── transcore_m.py ├── ursa ├── __init__.py ├── ursa_chat.py └── ursa_model │ ├── __init__.py │ ├── clip_encoder.py │ ├── configuration_ursa.py │ ├── image_processing_vlm.py │ ├── modeling_ursa.py │ ├── processing_ursa.py │ ├── projector.py │ ├── sam.py │ └── siglip_vit.py ├── valley ├── __init__.py ├── requirements_valley.txt └── valley.py ├── video_llm ├── __init__.py ├── chat_uni_vi.py ├── configs │ ├── llama_vid │ │ └── processor │ │ │ └── clip-patch14-224 │ │ │ ├── config.json │ │ │ └── preprocessor_config.json │ └── videochat2_hd.json ├── llama_vid.py ├── pllava.py ├── video_chatgpt.py ├── video_llava.py └── videochat2.py ├── vila.py ├── vintern_chat.py ├── visualglm.py ├── vita.py ├── vlaa_thinker.py ├── vlm_r1.py ├── 
vxverse.py ├── wemm.py ├── wethink_vl.py ├── xcomposer ├── __init__.py ├── sharecaptioner.py ├── xcomposer.py ├── xcomposer2.py ├── xcomposer2_4KHD.py └── xcomposer2d5.py ├── xgen_mm.py └── yi_vl.py /.github/scripts/assert_score.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import ast 3 | import json 4 | import os 5 | 6 | import pandas as pd 7 | 8 | 9 | def validate_scores(dataset_list, assert_score, model_name): 10 | for dataset in dataset_list: 11 | base_score = assert_score[dataset][model_name] 12 | if dataset == "OCRBench_MINI": 13 | score_file = os.path.join("outputs", f"{model_name}/{model_name}_{dataset}_score.json") 14 | cur_score = 0 15 | with open(score_file, "r") as f: 16 | total_score = json.load(f) 17 | cur_score = total_score["Final Score Norm"] 18 | assert ( 19 | abs(cur_score - float(base_score)) <= 0.01 20 | ), f"{dataset} on {model_name}: cur_score is {cur_score}, base_score is {base_score}" 21 | else: 22 | score_file = os.path.join("outputs", f"{model_name}/{model_name}_{dataset}_acc.csv") 23 | df = pd.read_csv(score_file) 24 | cur_score = df["Overall"].iloc[0] 25 | if dataset == "MMBench_V11_MINI": 26 | cur_score = df.loc[df["split"] == "dev", "Overall"].values 27 | assert ( 28 | abs(cur_score - float(base_score)) <= 0.01 29 | ), f"{dataset} on {model_name}: cur_score is {cur_score}, base_score is {base_score}" 30 | print(f"cur_score is {cur_score}, base_score is {base_score}") 31 | 32 | 33 | def parse_arguments(): 34 | parser = argparse.ArgumentParser(description="Validate model scores against csv/json data") 35 | 36 | parser.add_argument("--dataset", type=str, required=True, help="Space-separated list of datasets") 37 | 38 | parser.add_argument( 39 | "--base_score", type=str, required=True, help="Dictionary string in format {dataset:{model:score}}" 40 | ) 41 | 42 | parser.add_argument("--model-name", type=str, required=True, help="Name of the model to validate") 43 | 44 | return parser.parse_args() 45 | 46 | 47 | def main(): 48 | args = parse_arguments() 49 | 50 | try: 51 | dataset_list = args.dataset.split() 52 | base_score = ast.literal_eval(args.base_score) 53 | except Exception as e: 54 | print(f"Parameter parsing error: {str(e)}") 55 | return 56 | 57 | validate_scores(dataset_list, base_score, args.model_name) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.10 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.10.15 18 | - name: Install pre-commit hook 19 | run: | 20 | pip install pre-commit 21 | pre-commit install 22 | - name: Linting 23 | run: pre-commit run --all-files 24 | -------------------------------------------------------------------------------- /.github/workflows/pr-run-test.yml: -------------------------------------------------------------------------------- 1 | name: pr_run_test 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - "main" 7 | paths-ignore: 8 | - "docs/**" 9 | - "**.md" 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 13 | 
cancel-in-progress: true 14 | 15 | env: 16 | BASE_SCORE: '{"MMBench_V11_MINI":{"Qwen2-VL-7B-Instruct":0.8727272727272727,"InternVL2_5-8B":0.8727272727272727,"llava_onevision_qwen2_7b_si":0.8363636363636363},"MMStar_MINI":{"Qwen2-VL-7B-Instruct":0.6266666666666667,"InternVL2_5-8B":0.6333333333333333,"llava_onevision_qwen2_7b_si":0.49333333333333335},"AI2D_MINI":{"Qwen2-VL-7B-Instruct":0.7854251012145749,"InternVL2_5-8B":0.8421052631578947,"llava_onevision_qwen2_7b_si":0.8178137651821862},"OCRBench_MINI":{"Qwen2-VL-7B-Instruct":16.6,"InternVL2_5-8B":16.4,"llava_onevision_qwen2_7b_si":12.9}}' 17 | 18 | jobs: 19 | vlm_test: 20 | if: ${{!cancelled()}} 21 | runs-on: [linux-a100] 22 | strategy: 23 | fail-fast: false 24 | matrix: 25 | model: [Qwen/Qwen2-VL-7B-Instruct,OpenGVLab/InternVL2_5-8B,lmms-lab/llava-onevision-qwen2-7b-si] 26 | dataset: ["MMBench_V11_MINI MMStar_MINI AI2D_MINI","OCRBench_MINI"] 27 | container: 28 | image: kkscilife/vlmevalkit_2:a100 29 | options: "--gpus=all --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy --pull never" 30 | volumes: 31 | - /mnt/187:/mnt/187 32 | steps: 33 | - name: clone_repo 34 | uses: actions/checkout@v3 35 | - name: evaluation_model 36 | run: | 37 | pip install -e . 38 | pre_model=$(echo ${{matrix.model}} | awk -F'/' '{print $1}') 39 | ln -s /mnt/187/$pre_model . 40 | if [ "${{matrix.model}}" = "lmms-lab/llava-onevision-qwen2-7b-si" ];then 41 | model_name="llava_onevision_qwen2_7b_si" 42 | else 43 | model_name=$(echo ${{matrix.model}} | awk -F'/' '{print $2}') 44 | fi 45 | nvidia-smi 46 | python run.py --data ${{matrix.dataset}} --model $model_name 47 | python .github/scripts/assert_score.py --dataset "${{matrix.dataset}}" --base_score $BASE_SCORE --model-name $model_name 48 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: | 2 | (?x)^( 3 | scripts/| 4 | assets/| 5 | vlmeval/config.py | 6 | vlmeval/dataset/utils/wemath.py | 7 | vlmeval/dataset/Omnidocbench/ | 8 | vlmeval/dataset/utils/megabench/ | 9 | vlmeval/dataset/utils/vgrpbench/ | 10 | vlmeval/vlm/ola/ | 11 | vlmeval/vlm/ursa/ | 12 | vlmeval/vlm/ovis/ 13 | ) 14 | repos: 15 | - repo: https://github.com/PyCQA/flake8 16 | rev: 6.1.0 17 | hooks: 18 | - id: flake8 19 | args: 20 | [ 21 | "--max-line-length=120", 22 | "--ignore=F401,F403,F405,E402,E722,E741,W503,E231,E702", 23 | ] 24 | exclude: ^configs/ 25 | - repo: https://github.com/pre-commit/mirrors-yapf 26 | rev: v0.30.0 27 | hooks: 28 | - id: yapf 29 | args: ["--style={column_limit=120}"] 30 | - repo: https://github.com/pre-commit/pre-commit-hooks 31 | rev: v3.1.0 32 | hooks: 33 | - id: trailing-whitespace 34 | - id: check-yaml 35 | - id: end-of-file-fixer 36 | - id: requirements-txt-fixer 37 | - id: check-merge-conflict 38 | - id: fix-encoding-pragma 39 | args: ["--remove"] 40 | - id: mixed-line-ending 41 | args: ["--fix=lf"] 42 | -------------------------------------------------------------------------------- /assets/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/assets/apple.jpg -------------------------------------------------------------------------------- /docs/en/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you 
might need 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: "3.8" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/en/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | -------------------------------------------------------------------------------- /docs/en/Contributors.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | ## Contributors w. 3+ Major Contributions 4 | 5 | > In this section, we list all the contributors who have made significant contributions (3+) to the development of VLMEvalKit. 6 | 7 | New Qualified Contributors (2024.09): 8 | 9 | 1. [amitbcp](https://github.com/amitbcp): The contributor helped support MUIRBench, Phi-3.5, Idefics3, VILA, and xGen-MM 10 | 2. [czczup](https://github.com/czczup): The contributor helped support the InternVL Series (V1.5, Mini-InternVL, V2, etc.) 11 | 3. [DseidLi](https://github.com/DseidLi): The contributor helped support LLaVA-OneVision, GQA, and developed the readthedocs site for VLMEvalKit 12 | 4. [mayubo2333](https://github.com/mayubo2333): The contributor helped support MMLongBench, SlideVQA, and DUDE 13 | 5. [sun-hailong](https://github.com/sun-hailong): The contributor helped support A-OKVQA, Parrot, MMMB, and MTL-MMBench 14 | 6. [PhoenixZ810](https://github.com/PhoenixZ810): The contributor helped support Video-ChatGPT, Chat-UniVI, and Llama-VID 15 | 7. [Cuiunbo](https://github.com/Cuiunbo): The contributor helped support OmniLMM-12B, MiniCPM-V Series (V1, V2, V2.5) 16 | 17 | ## Full Contributor List 18 | 19 | > In this section, we list all the contributors as well as their corresponding contributions to the development of VLMEvalKit. 20 | 21 | TBD. 22 | -------------------------------------------------------------------------------- /docs/en/EvalByLMDeploy.md: -------------------------------------------------------------------------------- 1 | # Using LMDeploy to Accelerate Evaluation and Inference 2 | 3 | VLMEvalKit supports testing VLM models deployed by LMDeploy. Below, we use InternVL2-8B as an example to show how to test the model. 4 | 5 | ## Step 0: Install LMDeploy 6 | 7 | ```bash 8 | pip install lmdeploy 9 | ``` 10 | For other installation methods, you can refer to LMDeploy's [documentation](https://github.com/InternLM/lmdeploy). 11 | 12 | ## Step 1: Start the Inference Service 13 | 14 | ```bash 15 | lmdeploy serve api_server OpenGVLab/InternVL2-8B --model-name InternVL2-8B 16 | ``` 17 | > [!IMPORTANT] 18 | > Since models in VLMEvalKit may have custom behaviors when building prompts for different datasets, such as InternVL2's handling of HallusionBench, it is necessary to specify `--model-name` when starting the server. This allows the VLMEvalKit to select appropriate prompt construction strategy based on the name when using the LMDeploy API. 19 | > 20 | > If `--server-port`, is specified, the corresponding environment variable `LMDEPLOY_API_BASE` needs to be set. 21 | 22 | 23 | ## Step 2: Evaluation 24 | 25 | ```bash 26 | python run.py --data MMStar --model lmdeploy --verbose --api-nproc 64 27 | ``` 28 | -------------------------------------------------------------------------------- /docs/en/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
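# For example (a typical invocation, assuming Sphinx and the packages in requirements/docs.txt are installed):
#   make html SPHINXOPTS="-W"   # build the HTML docs into _build/html, treating warnings as errors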
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/en/_static/css/readthedocs.css: -------------------------------------------------------------------------------- 1 | .header-logo { 2 | background-image: url("../image/logo.svg"); 3 | background-size: 275px 80px; 4 | height: 80px; 5 | width: 275px; 6 | } 7 | 8 | 9 | @media screen and (min-width: 1100px) { 10 | .header-logo { 11 | top: -25px; 12 | } 13 | } 14 | 15 | pre { 16 | white-space: pre; 17 | } 18 | 19 | @media screen and (min-width: 2000px) { 20 | .pytorch-content-left { 21 | width: 1200px; 22 | margin-left: 30px; 23 | } 24 | article.pytorch-article { 25 | max-width: 1200px; 26 | } 27 | .pytorch-breadcrumbs-wrapper { 28 | width: 1200px; 29 | } 30 | .pytorch-right-menu.scrolling-fixed { 31 | position: fixed; 32 | top: 45px; 33 | left: 1580px; 34 | } 35 | } 36 | 37 | 38 | article.pytorch-article section code { 39 | padding: .2em .4em; 40 | background-color: #f3f4f7; 41 | border-radius: 5px; 42 | } 43 | 44 | /* Disable the change in tables */ 45 | article.pytorch-article section table code { 46 | padding: unset; 47 | background-color: unset; 48 | border-radius: unset; 49 | } 50 | 51 | table.autosummary td { 52 | width: 50% 53 | } 54 | 55 | img.align-center { 56 | display: block; 57 | margin-left: auto; 58 | margin-right: auto; 59 | } 60 | 61 | article.pytorch-article p.rubric { 62 | font-weight: bold; 63 | } 64 | -------------------------------------------------------------------------------- /docs/en/_static/image/logo_icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/en/_static/js/custom.js: -------------------------------------------------------------------------------- 1 | var collapsedSections = []; 2 | 3 | $(document).ready(function () { 4 | $('.model-summary').DataTable({ 5 | "stateSave": false, 6 | "lengthChange": false, 7 | "pageLength": 20, 8 | "order": [] 9 | }); 10 | }); 11 | -------------------------------------------------------------------------------- /docs/en/_templates/404.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block body %} 4 | 5 |

Page Not Found 6 | 7 | The page you are looking for cannot be found. 8 | 9 | 10 | If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in 11 | the content table left, or go to the homepage. 12 |
13 | 17 | 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /docs/en/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | .. 12 | autogenerated from _templates/autosummary/class.rst 13 | note it does not have :inherited-members: 14 | -------------------------------------------------------------------------------- /docs/en/_templates/callable.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | :special-members: __call__ 11 | 12 | .. 13 | autogenerated from _templates/callable.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /docs/en/docutils.conf: -------------------------------------------------------------------------------- 1 | [html writers] 2 | table_style: colwidths-auto 3 | -------------------------------------------------------------------------------- /docs/en/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to the VLMEvalKit Tutorial! 2 | ========================================== 3 | 4 | VLMEvalKit Getting Started Guide 5 | ------------------------------- 6 | 7 | To help users get started quickly, we recommend the following process: 8 | 9 | - For users who want to use VLMEvalKit, we recommend reading the "Start Your First Step" section to set up the environment and start a mini-experiment to familiarize yourself with the process. 10 | 11 | - If you want to customize more modules, such as adding datasets and models, we provide an "Advanced Tutorial." 12 | 13 | We always welcome users' PRs (Pull Requests) and Issues to improve VLMEvalKit! 14 | 15 | .. _Start Your First Step: 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Start Your First Step 19 | 20 | Quickstart.md 21 | 22 | .. _Advanced Tutorial: 23 | .. toctree:: 24 | :maxdepth: 1 25 | :caption: Advanced Tutorial 26 | 27 | Development.md 28 | ConfigSystem.md 29 | 30 | .. _Other Notes: 31 | .. 
toctree:: 32 | :maxdepth: 1 33 | :caption: Other Notes 34 | 35 | Contributors.md 36 | 37 | Index and Tables 38 | ================== 39 | 40 | * :ref:`genindex` 41 | * :ref:`search` 42 | -------------------------------------------------------------------------------- /docs/zh-CN/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: "3.8" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/zh-CN/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | -------------------------------------------------------------------------------- /docs/zh-CN/ConfigSystem.md: -------------------------------------------------------------------------------- 1 | 2 | # 配置系统 3 | 4 | 默认情况下,VLMEvalKit通过在`run.py`脚本中使用`--model`和`--data`参数设置模型名称(在`/vlmeval/config.py`中定义)和数据集名称(在`vlmeval/dataset/__init__.py` 或 `vlmeval/dataset/video_dataset_config.py` 中定义)来启动评估。这种方法在大多数情况下简单且高效,但当用户希望使用不同设置评估多个模型/数据集时,可能不够灵活。 5 | 6 | 为了解决这个问题,VLMEvalKit提供了一个更灵活的配置系统。用户可以在json文件中指定模型和数据集设置,并通过`--config`参数将配置文件的路径传递给`run.py`脚本。以下是一个示例配置json: 7 | 8 | ```json 9 | { 10 | "model": { 11 | "GPT4o_20240806_T00_HIGH": { 12 | "class": "GPT4V", 13 | "model": "gpt-4o-2024-08-06", 14 | "temperature": 0, 15 | "img_detail": "high" 16 | }, 17 | "GPT4o_20240806_T10_Low": { 18 | "class": "GPT4V", 19 | "model": "gpt-4o-2024-08-06", 20 | "temperature": 1.0, 21 | "img_detail": "low" 22 | }, 23 | "GPT4o_20241120": {} 24 | }, 25 | "data": { 26 | "MME-RealWorld-Lite": { 27 | "class": "MMERealWorld", 28 | "dataset": "MME-RealWorld-Lite" 29 | }, 30 | "MMBench_DEV_EN_V11": { 31 | "class": "ImageMCQDataset", 32 | "dataset": "MMBench_DEV_EN_V11" 33 | }, 34 | "MMBench_Video_8frame_nopack":{}, 35 | "Video-MME_16frame_subs": { 36 | "class": "VideoMME", 37 | "dataset": "Video-MME", 38 | "nframe": 16, 39 | "use_subtitle": true 40 | } 41 | } 42 | } 43 | ``` 44 | 45 | 配置json的解释: 46 | 47 | 1. 现在我们支持两个字段:`model`和`data`,每个字段都是一个字典。字典的键是模型/数据集的名称(由用户设置),值是模型/数据集的设置。 48 | 2. 对于`model`中的项目,值是一个包含以下键的字典: 49 | - `class`:模型的类名,应该是`vlmeval/vlm/__init__.py`(开源模型)或`vlmeval/api/__init__.py`(API模型)中定义的类名。 50 | - 其他kwargs:其他kwargs是模型特定的参数,请参考模型类的定义以获取详细用法。例如,`model`、`temperature`、`img_detail`是`GPT4V`类的参数。值得注意的是,大多数模型类都需要`model`参数。 51 | - Tip:在位于`vlmeval/config.py`的变量`supported_VLM`中的已经被定义的模型可以作为`model`的键,而不需要填对应的值即可启动。例如,`GPT4o_20240806_T00_HIGH: {}`是等价于`GPT4o_20240806_T00_HIGH: {'class': 'GPT4V', 'model': 'gpt-4o-2024-08-06', 'temperature': 0, 'img_size': -1, 'img_detail': 'high', 'retry': 10, 'verbose': False}`。 52 | 3. 
对于字典`data`,我们建议用户使用官方数据集名称作为键(或键的一部分),因为我们经常根据数据集名称确定后处理/判断设置。对于`data`中的项目,值是一个包含以下键的字典: 53 | - `class`:数据集的类名,应该是`vlmeval/dataset/__init__.py`中定义的类名。 54 | - 其他kwargs:其他kwargs是数据集特定的参数,请参考数据集类的定义以获取详细用法。通常,大多数数据集类都需要`dataset`参数。大多数视频数据集类都需要 `nframe` 或 `fps` 参数。 55 | - Tip:在位于`vlmeval/dataset/video_dataset_config.py`的变量`supported_video_dataset`中的已经被定义的数据集可以作为`data`的键,而不需要填对应的值即可启动。例如,`MMBench_Video_8frame_nopack: {}`是等价于`MMBench_Video_8frame_nopack: {'class': 'MMBenchVideo', 'dataset': 'MMBench-Video', 'nframe': 8, 'pack': False}`。 56 | 57 | 将示例配置json保存为`config.json`,您可以通过以下命令启动评估: 58 | 59 | ```bash 60 | python run.py --config config.json 61 | ``` 62 | 63 | 这将在工作目录`$WORK_DIR`下生成以下输出文件(格式为`{$WORK_DIR}/{$MODEL_NAME}/{$MODEL_NAME}_{$DATASET_NAME}_*`): 64 | 65 | - `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MME-RealWorld-Lite*` 66 | - `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MME-RealWorld-Lite*` 67 | - `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MMBench_DEV_EN_V11*` 68 | - `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MMBench_DEV_EN_V11*` 69 | ...... 70 | -------------------------------------------------------------------------------- /docs/zh-CN/EvalByLMDeploy.md: -------------------------------------------------------------------------------- 1 | # 使用 LMDeploy 加速评测推理 2 | 3 | VLMEvalKit 支持测试由 LMDeploy 部署的 VLM 模型,下面以 InternVL2-8B 为例,展示如何测试模型 4 | 5 | ## 第0步 安装 LMDeploy 6 | 7 | ```bash 8 | pip install lmdeploy 9 | ``` 10 | 11 | 其他安装方式可以参考 LMDeploy 的[文档](https://github.com/InternLM/lmdeploy) 12 | 13 | ## 第1步 启动推理服务 14 | 15 | ```bash 16 | lmdeploy serve api_server OpenGVLab/InternVL2-8B --model-name InternVL2-8B 17 | ``` 18 | > [!IMPORTANT] 19 | > 因为 VLMEvalKit 中的模型对于不同数据集在构建 prompt 时可能有自定义行为,如 InternVL2 对于 HallusionBench 的处理,所以,server 端在启动的时候需要指定 `--model-name`,这样在使用 LMDEploy api 时可以根据名字选择合适的 prompt 构建策略。 20 | > 21 | > 如果指定了 `--server-port`,需要设置对应的环境变量 `LMDEPLOY_API_BASE` 22 | 23 | 24 | ## 第2步 评测 25 | 26 | ```bash 27 | python run.py --data MMStar --model InternVL2-8B --verbose --api-nproc 64 28 | ``` 29 | -------------------------------------------------------------------------------- /docs/zh-CN/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/zh-CN/_static/css/readthedocs.css: -------------------------------------------------------------------------------- 1 | .header-logo { 2 | background-image: url("../image/logo.svg"); 3 | background-size: 275px 80px; 4 | height: 80px; 5 | width: 275px; 6 | } 7 | 8 | 9 | @media screen and (min-width: 1100px) { 10 | .header-logo { 11 | top: -25px; 12 | } 13 | } 14 | 15 | pre { 16 | white-space: pre; 17 | } 18 | 19 | @media screen and (min-width: 2000px) { 20 | .pytorch-content-left { 21 | width: 1200px; 22 | margin-left: 30px; 23 | } 24 | article.pytorch-article { 25 | max-width: 1200px; 26 | } 27 | .pytorch-breadcrumbs-wrapper { 28 | width: 1200px; 29 | } 30 | .pytorch-right-menu.scrolling-fixed { 31 | position: fixed; 32 | top: 45px; 33 | left: 1580px; 34 | } 35 | } 36 | 37 | 38 | article.pytorch-article section code { 39 | padding: .2em .4em; 40 | background-color: #f3f4f7; 41 | border-radius: 5px; 42 | } 43 | 44 | /* Disable the change in tables */ 45 | article.pytorch-article section table code { 46 | padding: unset; 47 | background-color: unset; 48 | border-radius: unset; 49 | } 50 | 51 | table.autosummary td { 52 | width: 50% 53 | } 54 | 55 | img.align-center { 56 | display: block; 57 | margin-left: auto; 58 | margin-right: auto; 59 | } 60 | 61 | article.pytorch-article p.rubric { 62 | font-weight: bold; 63 | } 64 | -------------------------------------------------------------------------------- /docs/zh-CN/_static/image/logo_icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/zh-CN/_static/js/custom.js: -------------------------------------------------------------------------------- 1 | var collapsedSections = []; 2 | 3 | $(document).ready(function () { 4 | $('.model-summary').DataTable({ 5 | "stateSave": false, 6 | "lengthChange": false, 7 | "pageLength": 20, 8 | "order": [] 9 | }); 10 | }); 11 | -------------------------------------------------------------------------------- /docs/zh-CN/_templates/404.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block body %} 4 | 5 |

Page Not Found 6 | 7 | The page you are looking for cannot be found. 8 | 9 | 10 | If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in 11 | the content table left, or go to the homepage. 12 |
13 | 17 | 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /docs/zh-CN/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | .. 12 | autogenerated from _templates/autosummary/class.rst 13 | note it does not have :inherited-members: 14 | -------------------------------------------------------------------------------- /docs/zh-CN/_templates/callable.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | :special-members: __call__ 11 | 12 | .. 13 | autogenerated from _templates/callable.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /docs/zh-CN/cp_origin_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copy *.md files from docs/ if it doesn't have a Chinese translation 4 | 5 | for filename in $(find ../en/ -name '*.md' -printf "%P\n"); 6 | do 7 | mkdir -p $(dirname $filename) 8 | cp -n ../en/$filename ./$filename 9 | done 10 | -------------------------------------------------------------------------------- /docs/zh-CN/docutils.conf: -------------------------------------------------------------------------------- 1 | [html writers] 2 | table_style: colwidths-auto 3 | -------------------------------------------------------------------------------- /docs/zh-CN/index.rst: -------------------------------------------------------------------------------- 1 | 欢迎来到 VLMEvalKit 中文教程! 2 | ========================================== 3 | 4 | VLMEvalKit 上手路线 5 | ------------------------------- 6 | 7 | 为了用户能够快速上手,我们推荐以下流程: 8 | 9 | - 对于想要使用 VLMEvalKit 的用户,我们推荐先阅读 开始你的第一步_ 部分来设置环境,并启动一个迷你实验熟悉流程。 10 | 11 | - 若您想进行更多模块的自定义,例如增加数据集和模型,我们提供了 进阶教程_ 。 12 | 13 | 我们始终非常欢迎用户的 PRs 和 Issues 来完善 VLMEvalKit! 14 | 15 | .. _快速开始: 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: 快速开始 19 | 20 | Quickstart.md 21 | 22 | 23 | .. .. _教程: 24 | .. .. toctree:: 25 | .. :maxdepth: 1 26 | .. :caption: 教程 27 | 28 | .. user_guides/framework_overview.md 29 | 30 | .. _进阶教程: 31 | .. toctree:: 32 | :maxdepth: 1 33 | :caption: 进阶教程 34 | 35 | Development.md 36 | ConfigSystem.md 37 | 38 | .. .. _其他说明: 39 | .. .. toctree:: 40 | .. :maxdepth: 1 41 | .. :caption: 其他说明 42 | 43 | .. 
notes/contribution_guide.md 44 | 45 | 索引与表格 46 | ================== 47 | 48 | * :ref:`genindex` 49 | * :ref:`search` 50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | dotenv 3 | einops 4 | # for gemini api 5 | google-genai 6 | gradio 7 | huggingface_hub 8 | imageio 9 | ipdb 10 | json_repair 11 | matplotlib 12 | nltk 13 | numpy 14 | omegaconf 15 | openai 16 | opencv-python>=4.4.0.46 17 | openpyxl 18 | pandas 19 | pillow 20 | portalocker 21 | protobuf 22 | python-dotenv 23 | qwen_vl_utils 24 | requests 25 | rich 26 | sentencepiece 27 | setuptools 28 | sty 29 | tabulate 30 | tiktoken 31 | timeout-decorator 32 | timm 33 | torch 34 | torchvision 35 | tqdm 36 | transformers 37 | typing_extensions 38 | validators 39 | xlsxwriter 40 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | docutils==0.18.1 2 | modelindex 3 | myst-parser 4 | -e git+https://github.com/open-compass/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme 5 | sphinx==6.1.3 6 | sphinx-copybutton 7 | sphinx-design 8 | sphinx-notfound-page 9 | sphinx-tabs 10 | sphinxcontrib-jquery 11 | tabulate 12 | -------------------------------------------------------------------------------- /scripts/apires_scan.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from vlmeval import * 3 | from vlmeval.dataset import SUPPORTED_DATASETS 4 | FAIL_MSG = 'Failed to obtain answer via API.' 5 | 6 | root = sys.argv[1] 7 | if root[-1] in '/\\': 8 | root = root[:-1] 9 | 10 | model_name = root.split('/')[-1] 11 | 12 | for d in SUPPORTED_DATASETS: 13 | fname = f'{model_name}_{d}.xlsx' 14 | pth = osp.join(root, fname) 15 | if osp.exists(pth): 16 | data = load(pth) 17 | # Detect Failure 18 | assert 'prediction' in data 19 | data['prediction'] = [str(x) for x in data['prediction']] 20 | fail = [FAIL_MSG in x for x in data['prediction']] 21 | if sum(fail): 22 | nfail = sum(fail) 23 | ntot = len(fail) 24 | print(f'Model {model_name} x Dataset {d}: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ') 25 | 26 | eval_files = ls(root, match=f'{model_name}_{d}_') 27 | eval_files = [x for x in eval_files if listinstr([f'{d}_openai', f'{d}_gpt'], x) and x.endswith('.xlsx')] 28 | 29 | if len(eval_files) == 0: 30 | print(f'Model {model_name} x Dataset {d} openai missing') 31 | continue 32 | 33 | assert len(eval_files) == 1 34 | eval_file = eval_files[0] 35 | data = load(eval_file) 36 | 37 | if 'MMVet' in d: 38 | bad = [x for x in data['log'] if 'All 5 retries failed.' 
in str(x)] 39 | if len(bad): 40 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 41 | elif 'MathVista' in d: 42 | bad = [x for x in data['res'] if FAIL_MSG in str(x)] 43 | if len(bad): 44 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 45 | 46 | elif d == 'LLaVABench': 47 | sub = data[data['gpt4_score'] == -1] 48 | sub = sub[sub['gpt4_score'] == -1] 49 | if len(sub): 50 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(sub)} out of {len(data)} failed.') 51 | else: 52 | bad = [x for x in data['log'] if FAIL_MSG in str(x)] 53 | if len(bad): 54 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 55 | -------------------------------------------------------------------------------- /scripts/auto_run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from vlmeval.smp import * 3 | from vlmeval.config import supported_VLM 4 | 5 | def is_api(x): 6 | return getattr(supported_VLM[x].func, 'is_api', False) 7 | 8 | models = list(supported_VLM) 9 | models = [x for x in models if 'fs' not in x] 10 | models = [x for x in models if not is_api(x)] 11 | exclude_list = ['cogvlm-grounding-generalist', 'emu2'] 12 | models = [x for x in models if x not in exclude_list] 13 | 14 | def is_large(x): 15 | return '80b' in x or 'emu2' in x or '34B' in x 16 | 17 | small_models = [x for x in models if not is_large(x)] 18 | large_models = [x for x in models if is_large(x)] 19 | models = small_models + large_models 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--data', type=str, nargs='+', required=True) 23 | args = parser.parse_args() 24 | 25 | # Skip some models 26 | models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)] 27 | 28 | for m in models: 29 | unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')] 30 | if len(unknown_datasets) == 0: 31 | continue 32 | dataset_str = ' '.join(unknown_datasets) 33 | if '80b' in m: 34 | cmd = f'python run.py --data {dataset_str} --model {m}' 35 | else: 36 | cmd = f'bash run.sh --data {dataset_str} --model {m}' 37 | print(cmd) 38 | os.system(cmd) -------------------------------------------------------------------------------- /scripts/cover.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 3 | cp $DIR/../config.py $DIR/../vlmeval/ 4 | cp $DIR/../misc/* $DIR/../vlmeval/vlm/misc/ -------------------------------------------------------------------------------- /scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | export GPU=$(nvidia-smi --list-gpus | wc -l) 4 | torchrun --nproc-per-node=$GPU run.py ${@:1} -------------------------------------------------------------------------------- /scripts/srun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2} -------------------------------------------------------------------------------- /vlmeval/__init__.py: -------------------------------------------------------------------------------- 1 | import ssl 2 | ssl._create_default_https_context = 
ssl._create_unverified_context 3 | # Temporarily bypass SSL certificate verification to download files from oss. 4 | 5 | try: 6 | import torch 7 | except ImportError: 8 | pass 9 | 10 | from .smp import * 11 | from .api import * 12 | from .dataset import * 13 | from .utils import * 14 | from .vlm import * 15 | from .config import * 16 | from .tools import cli 17 | 18 | load_env() 19 | 20 | __version__ = '0.2rc1' 21 | -------------------------------------------------------------------------------- /vlmeval/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import OpenAIWrapper, GPT4V 2 | from .hf_chat_model import HFChatModel 3 | from .gemini import GeminiWrapper, Gemini 4 | from .qwen_vl_api import QwenVLWrapper, QwenVLAPI, Qwen2VLAPI 5 | from .qwen_api import QwenAPI 6 | from .claude import Claude_Wrapper, Claude3V 7 | from .reka import Reka 8 | from .glm_vision import GLMVisionAPI 9 | from .cloudwalk import CWWrapper 10 | from .sensechat_vision import SenseChatVisionAPI 11 | from .siliconflow import SiliconFlowAPI, TeleMMAPI 12 | from .hunyuan import HunyuanVision 13 | from .bailingmm import bailingMMAPI 14 | from .bluelm_v_api import BlueLMWrapper, BlueLM_V_API 15 | from .jt_vl_chat import JTVLChatAPI 16 | from .taiyi import TaiyiAPI 17 | from .lmdeploy import LMDeployAPI 18 | from .taichu import TaichuVLAPI, TaichuVLRAPI 19 | from .doubao_vl_api import DoubaoVL 20 | from .mug_u import MUGUAPI 21 | 22 | __all__ = [ 23 | 'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V', 'Gemini', 24 | 'QwenVLWrapper', 'QwenVLAPI', 'QwenAPI', 'Claude3V', 'Claude_Wrapper', 25 | 'Reka', 'GLMVisionAPI', 'CWWrapper', 'SenseChatVisionAPI', 'HunyuanVision', 26 | 'Qwen2VLAPI', 'BlueLMWrapper', 'BlueLM_V_API', 'JTVLChatAPI', 27 | 'bailingMMAPI', 'TaiyiAPI', 'TeleMMAPI', 'SiliconFlowAPI', 'LMDeployAPI', 28 | 'TaichuVLAPI', 'TaichuVLRAPI', 'DoubaoVL', "MUGUAPI" 29 | ] 30 | -------------------------------------------------------------------------------- /vlmeval/api/glm_vision.py: -------------------------------------------------------------------------------- 1 | import requests 2 | requests.packages.urllib3.disable_warnings() 3 | 4 | from vlmeval.smp import * 5 | from vlmeval.api.base import BaseAPI 6 | from vlmeval.dataset import DATASET_TYPE 7 | from vlmeval.smp.vlm import encode_image_file_to_base64 8 | 9 | 10 | class GLMVisionWrapper(BaseAPI): 11 | 12 | is_api: bool = True 13 | 14 | def __init__(self, 15 | model: str, 16 | retry: int = 5, 17 | wait: int = 5, 18 | key: str = None, 19 | verbose: bool = True, 20 | system_prompt: str = None, 21 | max_tokens: int = 4096, 22 | proxy: str = None, 23 | **kwargs): 24 | 25 | from zhipuai import ZhipuAI 26 | self.model = model 27 | self.fail_msg = 'Failed to obtain answer via API. 
' 28 | if key is None: 29 | key = os.environ.get('GLMV_API_KEY', None) 30 | assert key is not None, ( 31 | 'Please set the API Key (obtain it here: ' 32 | 'https://bigmodel.cn)' 33 | ) 34 | self.client = ZhipuAI(api_key=key) 35 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 36 | 37 | def build_msgs(self, msgs_raw, system_prompt=None, dataset=None): 38 | msgs = cp.deepcopy(msgs_raw) 39 | content = [] 40 | for i, msg in enumerate(msgs): 41 | if msg['type'] == 'text': 42 | content.append(dict(type='text', text=msg['value'])) 43 | elif msg['type'] == 'image': 44 | content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value'])))) 45 | if dataset in {'HallusionBench', 'POPE'}: 46 | content.append(dict(type="text", text="Please answer yes or no.")) 47 | ret = [dict(role='user', content=content)] 48 | return ret 49 | 50 | def generate_inner(self, inputs, **kwargs) -> str: 51 | assert isinstance(inputs, str) or isinstance(inputs, list) 52 | inputs = [inputs] if isinstance(inputs, str) else inputs 53 | 54 | messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None)) 55 | 56 | response = self.client.chat.completions.create( 57 | model=self.model, 58 | messages=messages, 59 | do_sample=False, 60 | max_tokens=2048 61 | ) 62 | try: 63 | answer = response.choices[0].message.content.strip() 64 | if self.verbose: 65 | self.logger.info(f'inputs: {inputs}\nanswer: {answer}') 66 | return 0, answer, 'Succeeded!' 67 | except Exception as err: 68 | if self.verbose: 69 | self.logger.error(f'{type(err)}: {err}') 70 | self.logger.error(f'The input messages are {inputs}.') 71 | return -1, self.fail_msg, '' 72 | 73 | 74 | class GLMVisionAPI(GLMVisionWrapper): 75 | 76 | def generate(self, message, dataset=None): 77 | return super(GLMVisionAPI, self).generate(message, dataset=dataset) 78 | -------------------------------------------------------------------------------- /vlmeval/api/qwen_api.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | import os 3 | from vlmeval.api.base import BaseAPI 4 | from vlmeval.smp import * 5 | 6 | 7 | # Note: This is a pure language model API. 8 | class QwenAPI(BaseAPI): 9 | 10 | is_api: bool = True 11 | 12 | def __init__(self, 13 | model: str = 'qwen-max-1201', 14 | retry: int = 5, 15 | wait: int = 5, 16 | verbose: bool = True, 17 | seed: int = 2680, 18 | temperature: float = 0.0, 19 | system_prompt: str = None, 20 | key: str = None, 21 | max_tokens: int = 2048, 22 | proxy: str = None, 23 | **kwargs): 24 | 25 | assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext'] 26 | self.model = model 27 | import dashscope 28 | self.fail_msg = 'Failed to obtain answer via API. 
' 29 | self.max_tokens = max_tokens 30 | self.temperature = temperature 31 | self.seed = seed 32 | if key is None: 33 | key = os.environ.get('DASHSCOPE_API_KEY', None) 34 | assert key is not None, ( 35 | 'Please set the API Key (obtain it here: ' 36 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 37 | ) 38 | dashscope.api_key = key 39 | if proxy is not None: 40 | proxy_set(proxy) 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | @staticmethod 44 | def build_msgs(msgs_raw, system_prompt=None): 45 | msgs = cp.deepcopy(msgs_raw) 46 | ret = [] 47 | if system_prompt is not None: 48 | ret.append(dict(role='system', content=system_prompt)) 49 | for i, msg in enumerate(msgs): 50 | role = 'user' if i % 2 == 0 else 'assistant' 51 | ret.append(dict(role=role, content=msg)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | inputs = [inputs] if isinstance(inputs, str) else inputs 58 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 59 | 60 | import dashscope 61 | response = dashscope.Generation.call( 62 | model=self.model, 63 | messages=messages, 64 | seed=self.seed, 65 | temperature=self.temperature, 66 | max_tokens=self.max_tokens, 67 | result_format='message', # set the result to be "message" format. 68 | ) 69 | if response.status_code != HTTPStatus.OK: 70 | return -1, 'Error: Bad Response Statuse Code. ', f'The response status code is {response.status_code}. ' 71 | 72 | try: 73 | return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! ' 74 | except Exception as err: 75 | return -1, f'Error: Failed to parse the response. 
{err}', response 76 | -------------------------------------------------------------------------------- /vlmeval/api/reka.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import mimetypes 5 | 6 | 7 | class Reka_Wrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | INTERLEAVE: bool = False 11 | 12 | def __init__(self, 13 | model: str = 'reka-flash-20240226', 14 | key: str = None, 15 | retry: int = 10, 16 | wait: int = 3, 17 | system_prompt: str = None, 18 | verbose: bool = True, 19 | temperature: float = 0, 20 | max_tokens: int = 1024, 21 | **kwargs): 22 | 23 | try: 24 | import reka 25 | except ImportError: 26 | raise ImportError('Please install reka by running "pip install reka-api"') 27 | 28 | self.model = model 29 | default_kwargs = dict(temperature=temperature, request_output_len=max_tokens) 30 | default_kwargs.update(kwargs) 31 | self.kwargs = default_kwargs 32 | if key is not None: 33 | self.key = key 34 | else: 35 | self.key = os.environ.get('REKA_API_KEY', '') 36 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 37 | 38 | def generate_inner(self, inputs, **kwargs) -> str: 39 | import reka 40 | reka.API_KEY = self.key 41 | dataset = kwargs.pop('dataset', None) 42 | prompt, image_path = self.message_to_promptimg(inputs, dataset=dataset) 43 | image_b64 = encode_image_file_to_base64(image_path) 44 | 45 | response = reka.chat( 46 | model_name=self.model, 47 | human=prompt, 48 | media_url=f'data:image/jpeg;base64,{image_b64}', 49 | **self.kwargs) 50 | 51 | try: 52 | return 0, response['text'], response 53 | except Exception as err: 54 | return -1, self.fail_msg + str(err), response 55 | 56 | 57 | class Reka(Reka_Wrapper): 58 | 59 | def generate(self, message, dataset=None): 60 | return super(Reka_Wrapper, self).generate(message) 61 | -------------------------------------------------------------------------------- /vlmeval/api/stepai.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | url = 'https://api.stepfun.com/v1/chat/completions' 5 | headers = { 6 | 'Content-Type': 'application/json', 7 | 'Authorization': 'Bearer {}', 8 | } 9 | 10 | 11 | class StepAPI_INT(BaseAPI): 12 | 13 | is_api: bool = True 14 | 15 | def __init__(self, 16 | model: str = 'step-1v-8k', 17 | retry: int = 10, 18 | wait: int = 3, 19 | key: str = None, 20 | temperature: float = 0, 21 | max_tokens: int = 300, 22 | verbose: bool = True, 23 | system_prompt: str = None, 24 | **kwargs): 25 | self.model = model 26 | self.fail_msg = 'Fail to obtain answer via API.' 
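# Store the request settings; the API key (taken from the `key` argument or the
# STEPAI_API_KEY environment variable below) is substituted into the module-level
# Authorization header before any request is sent.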
27 | self.headers = headers 28 | self.temperature = temperature 29 | self.max_tokens = max_tokens 30 | self.system_prompt = system_prompt 31 | if key is not None: 32 | self.key = key 33 | else: 34 | self.key = os.environ.get('STEPAI_API_KEY', '') 35 | headers['Authorization'] = headers['Authorization'].format(self.key) 36 | 37 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 38 | 39 | @staticmethod 40 | def build_msgs(msgs_raw): 41 | messages = [] 42 | message = {'role': 'user', 'content': []} 43 | 44 | for msg in msgs_raw: 45 | if msg['type'] == 'image': 46 | image_b64 = encode_image_file_to_base64(msg['value']) 47 | message['content'].append({ 48 | 'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)}, 49 | 'type': 'image_url' 50 | }) 51 | elif msg['type'] == 'text': 52 | message['content'].append({ 53 | 'text': msg['value'], 54 | 'type': 'text' 55 | }) 56 | 57 | messages.append(message) 58 | return messages 59 | 60 | def generate_inner(self, inputs, **kwargs) -> str: 61 | print(inputs, '\n') 62 | payload = dict( 63 | model=self.model, 64 | max_tokens=self.max_tokens, 65 | temperature=self.temperature, 66 | messages=self.build_msgs(msgs_raw=inputs), 67 | **kwargs) 68 | response = requests.post(url, headers=headers, data=json.dumps(payload)) 69 | ret_code = response.status_code 70 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 71 | 72 | answer = self.fail_msg 73 | try: 74 | resp_struct = json.loads(response.text) 75 | answer = resp_struct['choices'][0]['message']['content'].strip() 76 | except Exception as err: 77 | if self.verbose: 78 | self.logger.error(f'{type(err)}: {err}') 79 | self.logger.error(response.text if hasattr(response, 'text') else response) 80 | 81 | return ret_code, answer, response 82 | 83 | 84 | class Step1V_INT(StepAPI_INT): 85 | 86 | def generate(self, message, dataset=None): 87 | return super(StepAPI_INT, self).generate(message) 88 | -------------------------------------------------------------------------------- /vlmeval/dataset/GUI/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/dataset/GUI/__init__.py -------------------------------------------------------------------------------- /vlmeval/dataset/Omnidocbench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/dataset/Omnidocbench/__init__.py -------------------------------------------------------------------------------- /vlmeval/dataset/Omnidocbench/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.26.0 2 | apted 3 | BeautifulSoup4 4 | evaluate 5 | func_timeout 6 | jmespath 7 | Levenshtein 8 | lxml 9 | nltk 10 | pylatexenc 11 | qwen_vl_utils 12 | scipy 13 | torchvision 14 | -------------------------------------------------------------------------------- /vlmeval/dataset/emma.py: -------------------------------------------------------------------------------- 1 | from vlmeval import * 2 | from .image_shortqa import ImageShortQADataset 3 | from .image_mcq import MMMUDataset 4 | 5 | 6 | class EMMADataset(ImageShortQADataset): 7 | 8 | COT_INST = "Please solve the problem step by step. 
" 9 | DIRECT_INST = "Please ensure that your output only contains the final answer without any additional content (such as intermediate reasoning steps)." # noqa: E501 10 | MCQ_FMT = "{context}\n\n{question}\n\n{options}\n\nAnswer with the option's letter from the given choices. " 11 | OPEN_FMT = "{context}\n\n{question}\n\nAnswer the question using a single word or phrase. " 12 | 13 | DATASET_URL = { 14 | 'EMMA': 'https://opencompass.openxlab.space/utils/VLMEval/EMMA.tsv', 15 | 'EMMA_COT': 'https://opencompass.openxlab.space/utils/VLMEval/EMMA.tsv' 16 | } 17 | 18 | def build_prompt(self, line): 19 | if isinstance(line, int): 20 | line = self.data.iloc[line] 21 | 22 | if self.meta_only: 23 | tgt_path = toliststr(line['image_path']) 24 | else: 25 | tgt_path = self.dump_image(line) 26 | 27 | context = line['context'] 28 | question = line['question'] 29 | example = "" 30 | _ = {} 31 | if line['type'] == 'MCQ': 32 | for ch in string.ascii_uppercase: 33 | if ch in line and not pd.isna(line[ch]): 34 | example += f"{ch}: {line[ch]}\n" 35 | 36 | prompt_tmpl = EMMADataset.MCQ_FMT 37 | if not pd.isna(context) and context is not None: 38 | prompt = prompt_tmpl.format(context=context, question=question, options=example) 39 | else: 40 | prompt = prompt_tmpl.split('{context}\n\n')[1].format(question=question, options=example) 41 | prompt += EMMADataset.COT_INST if 'COT' in self.dataset_name else EMMADataset.DIRECT_INST 42 | else: 43 | prompt_tmpl = EMMADataset.OPEN_FMT 44 | if not pd.isna(context) and context is not None: 45 | prompt = prompt_tmpl.format(context=context, question=question) 46 | else: 47 | prompt = prompt_tmpl.split('{context}\n\n')[1].format(question=question) 48 | prompt += EMMADataset.COT_INST if 'COT' in self.dataset_name else EMMADataset.DIRECT_INST 49 | 50 | msgs = [] 51 | if isinstance(tgt_path, list): 52 | msgs.extend([dict(type='image', value=p) for p in tgt_path]) 53 | else: 54 | msgs = [dict(type='image', value=tgt_path)] 55 | msgs.append(dict(type='text', value=prompt)) 56 | return MMMUDataset.split_MMMU(msgs) 57 | -------------------------------------------------------------------------------- /vlmeval/dataset/image_caption.py: -------------------------------------------------------------------------------- 1 | from .image_base import ImageBaseDataset 2 | from ..smp import * 3 | 4 | 5 | class COCO_Caption_Scorer(): 6 | def __init__(self, ref, gt): 7 | from pycocoevalcap.bleu.bleu import Bleu 8 | from pycocoevalcap.rouge.rouge import Rouge 9 | from pycocoevalcap.cider.cider import Cider 10 | 11 | self.ref = ref 12 | self.gt = gt 13 | print('setting up scorers...') 14 | self.scorers = [ 15 | (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), 16 | (Rouge(), 'ROUGE_L'), 17 | (Cider(), 'CIDEr'), 18 | ] 19 | 20 | def compute_scores(self): 21 | total_scores = {} 22 | for scorer, method in self.scorers: 23 | print('computing %s score...' 
% (scorer.method())) 24 | score, scores = scorer.compute_score(self.gt, self.ref) 25 | if isinstance(method, list): 26 | for sc, scs, m in zip(score, scores, method): 27 | print('%s: %0.3f' % (m, sc * 100)) 28 | total_scores['Bleu'] = [x * 100 for x in score] 29 | else: 30 | print('%s: %0.3f' % (method, score * 100)) 31 | total_scores[method] = score * 100 32 | 33 | print('*****DONE*****') 34 | for key, value in total_scores.items(): 35 | print('{}:{}'.format(key, value)) 36 | return total_scores 37 | 38 | 39 | class ImageCaptionDataset(ImageBaseDataset): 40 | 41 | TYPE = 'Caption' 42 | 43 | DATASET_URL = { 44 | 'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv', 45 | } 46 | 47 | DATASET_MD5 = { 48 | 'COCO_VAL': '72a5079dead060269ac222c5aa5128af', 49 | } 50 | 51 | def load_data(self, dataset): 52 | data = super().load_data(dataset) 53 | if 'question' not in data: 54 | data['question'] = [( 55 | 'Please describe this image in general. Directly provide the description, ' 56 | 'do not include prefix like "This image depicts". ' 57 | )] * len(data) 58 | return data 59 | 60 | # It returns a dictionary of scores 61 | @classmethod 62 | def evaluate(self, eval_file, **kwargs): 63 | data = load(eval_file) 64 | lt = len(data) 65 | lines = [data.iloc[i] for i in range(lt)] 66 | ref, gt = {}, {} 67 | for i, line in enumerate(lines): 68 | ref[str(i)] = [str(line['prediction'])] 69 | gt[str(i)] = eval(line['answer']) 70 | 71 | scorer = COCO_Caption_Scorer(ref, gt) 72 | coco_caption_score_dict = scorer.compute_scores() 73 | score_pth = eval_file.replace('.xlsx', '_score.json') 74 | dump(coco_caption_score_dict, score_pth) 75 | return coco_caption_score_dict 76 | -------------------------------------------------------------------------------- /vlmeval/dataset/mmgenbench.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import pandas as pd 3 | from abc import abstractmethod 4 | from ..smp import * 5 | from .image_base import ImageBaseDataset 6 | 7 | 8 | class MMGenBench(ImageBaseDataset): 9 | 10 | prompt_list = [ 11 | """ 12 | # Role 13 | You are an expert in the field of image understanding, focusing on the \ 14 | understanding of images and generating the image caption-prompt. 15 | 16 | # Definition Explanation 17 | image caption-prompt: Refers to the caption or description of an image, \ 18 | used to provide to a Text-to-Image model to generate a new image. 19 | Text-to-Image model: Can generate a new image based on the provided image \ 20 | caption-prompt, such as stable diffusion 3, flux, and other image generation models. 21 | 22 | # Task Description 23 | Generate an image caption-prompt based on the input image. 24 | 25 | # Key Points and Requirements 26 | 1. Accurately understand the input image and precisely generate an image caption-prompt. 27 | 2. The generated image caption-prompt, when provided to the Text-to-Image model, requires the \ 28 | Text-to-Image model to generate a new image that is as consistent as possible with the input image. 29 | 3. The generated image caption-prompt must conform to the preferences of the Text-to-Image model. 30 | 4. The generated image caption-prompt should describe the input image in as much \ 31 | detail as possible, and it should be between 20 to 60 words. 32 | 33 | # Output Format 34 | A string, that is the image caption-prompt. No extra output needed. 
35 | """ 36 | ] 37 | TYPE = 'GenerateImgPrompt' 38 | DATASET_URL = { 39 | 'MMGenBench-Test': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Test.tsv', 40 | 'MMGenBench-Domain': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Domain.tsv', 41 | } 42 | PROMPT_MAP = { 43 | 'MMGenBench-Test': prompt_list[0], 44 | 'MMGenBench-Domain': prompt_list[0], 45 | } 46 | DATASET_MD5 = { 47 | 'MMGenBench-Test': "94f8dac6bbf7c20be403f99adeaa73da", 48 | 'MMGenBench-Domain': "5c10daf6e2c5f08bdfb0701aa6db86bb", 49 | } 50 | 51 | def __init__(self, dataset='MMGenBench', **kwargs): 52 | super().__init__(dataset, **kwargs) 53 | warnings.warn('This dataset is for inference only and does not support direct output of evaluation results.\n') 54 | warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n') 55 | 56 | def load_data(self, dataset): 57 | data = super().load_data(dataset) 58 | if 'question' not in data: 59 | data['question'] = [( 60 | self.PROMPT_MAP[dataset] 61 | )] * len(data) 62 | return data 63 | 64 | # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe 65 | @abstractmethod 66 | def evaluate(self, eval_file, **judge_kwargs): 67 | warnings.warn('This evaluation method is not supported.\n') 68 | warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n') 69 | return None 70 | -------------------------------------------------------------------------------- /vlmeval/dataset/text_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from ..smp import * 3 | 4 | 5 | class TextBaseDataset: 6 | MODALITY = 'TEXT' 7 | DATASET_URL = {} 8 | DATASET_MD5 = {} 9 | 10 | def __init__(self, dataset='MMBench', **kwargs): 11 | self.dataset_name = dataset 12 | 13 | data = self.load_data(dataset) 14 | 15 | data['index'] = [str(x) for x in data['index']] 16 | 17 | if np.all([istype(x, int) for x in data['index']]): 18 | data['index'] = [int(x) for x in data['index']] 19 | 20 | self.data = data 21 | self.post_build(dataset) 22 | 23 | def __len__(self): 24 | return len(self.data) 25 | 26 | def __getitem__(self, idx): 27 | return dict(self.data.iloc[idx]) 28 | 29 | def prepare_tsv(self, url, file_md5=None): 30 | data_root = LMUDataRoot() 31 | os.makedirs(data_root, exist_ok=True) 32 | update_flag = False 33 | file_name = url.split('/')[-1] 34 | data_path = osp.join(data_root, file_name) 35 | if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5): 36 | pass 37 | else: 38 | warnings.warn('The dataset tsv is not downloaded') 39 | download_file(url, data_path) 40 | update_flag = True 41 | 42 | if file_size(data_path, 'GB') > 1: 43 | local_path = data_path.replace('.tsv', '_local.tsv') 44 | if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag: 45 | from ..tools import LOCALIZE 46 | LOCALIZE(data_path, local_path) 47 | data_path = local_path 48 | return load(data_path) 49 | 50 | def dump_image(self, line): 51 | return [] 52 | 53 | def display(self, line): 54 | if isinstance(line, int): 55 | line = self.data.iloc[line] 56 | assert isinstance(line, pd.Series) or isinstance(line, dict) 57 | mmqa_display(line) 58 | 59 | # Return a list of dataset names that are supported by this class, can override 60 | @classmethod 61 | def supported_datasets(cls): 62 | return list(cls.DATASET_URL) 63 | 64 | # Given the 
dataset name, return the dataset as a pandas dataframe, can override 65 | def load_data(self, dataset): 66 | url = self.DATASET_URL[dataset] 67 | file_md5 = self.DATASET_MD5[dataset] 68 | return self.prepare_tsv(url, file_md5) 69 | 70 | # Post built hook, will be called after the dataset is built, can override 71 | def post_build(self, dataset): 72 | pass 73 | 74 | # Given one data record, return the built prompt (a multi-modal message), can override 75 | def build_prompt(self, line): 76 | if isinstance(line, int): 77 | line = self.data.iloc[line] 78 | 79 | question = line['question'] 80 | 81 | msgs = [] 82 | msgs.append(dict(type='text', value=question)) 83 | return msgs 84 | 85 | # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe 86 | @abstractmethod 87 | def evaluate(self, eval_file, **judge_kwargs): 88 | pass 89 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .judge_util import build_judge, DEBUG_MESSAGE 2 | from .multiple_choice import extract_answer_from_item, prefetch_answer 3 | from .vqa_eval import levenshtein_distance 4 | from .spatial457 import Spatial457_utils 5 | 6 | 7 | __all__ = [ 8 | 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 9 | 'levenshtein_distance', 'DEBUG_MESSAGE', 10 | 'Spatial457_utils' 11 | ] 12 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/ccocr_evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | from .kie_evaluator import KieEvaluator 2 | from .doc_parsing_evaluator import ParsingEvaluator 3 | from .ocr_evaluator import OcrEvaluator 4 | from .common import summary 5 | 6 | 7 | evaluator_map_info = { 8 | "kie": KieEvaluator("kie"), 9 | "doc_parsing": ParsingEvaluator("doc_parsing"), 10 | "multi_lan_ocr": OcrEvaluator("multi_lan_ocr"), 11 | "multi_scene_ocr": OcrEvaluator("multi_scene_ocr") 12 | } 13 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/crpe.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from collections import defaultdict 4 | 5 | 6 | def is_correct(predict, answer): 7 | # predict是标准答案 answer是预测 8 | if len(answer) == 1: 9 | return answer[0] == predict[0] 10 | elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']: 11 | return answer[0] == predict[0] 12 | elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']: 13 | return predict[4:].lower() in answer.lower() 14 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/hrbench.py: -------------------------------------------------------------------------------- 1 | from ...smp import * 2 | import os 3 | 4 | 5 | def report_acc_hrbench(df): 6 | cycle_group = df.groupby('cycle_category') 7 | result_dic = defaultdict(list) 8 | avg_dic = defaultdict(int) 9 | 10 | count = 0 11 | for key, data_value in cycle_group: 12 | count += 1 13 | _, resp_dic = hrbench_score(data_value) 14 | 15 | for task_type, accuracy in resp_dic.items(): 16 | result_dic['cycle'].append(key) 17 | result_dic['type'].append(task_type) 18 | result_dic['accuracy'].append(accuracy) 19 | 20 | avg_dic[task_type] += accuracy 21 | for task_type, accuracy in avg_dic.items(): 22 | 
result_dic['cycle'].append('Average') 23 | result_dic['type'].append(task_type) 24 | result_dic['accuracy'].append(accuracy / count) 25 | result_pd = pd.DataFrame(result_dic) 26 | 27 | return result_pd 28 | 29 | 30 | def hrbench_score(data): 31 | ret = defaultdict(list) 32 | resp_dic = {} 33 | category_list = set(data['category']) 34 | score_dict = defaultdict(list) 35 | 36 | for i in range(len(data)): 37 | d = data.iloc[i] 38 | category = d['category'] 39 | gpt_score = d['hit'] 40 | score_dict[category].append(gpt_score) 41 | score_dict['all'].append(gpt_score) 42 | 43 | all_acc = np.mean(score_dict['all']) 44 | ret['type'].append('all') 45 | ret['acc'].append(all_acc) 46 | resp_dic['all'] = all_acc 47 | for cate in category_list: 48 | acc = np.mean(score_dict[cate]) 49 | ret['type'].append(cate) 50 | ret['acc'].append(acc) 51 | 52 | resp_dic[cate] = acc 53 | 54 | return pd.DataFrame(ret), resp_dic 55 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/judge_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ...smp import load_env 3 | 4 | INTERNAL = os.environ.get('INTERNAL', 0) 5 | 6 | 7 | def build_judge(**kwargs): 8 | from ...api import OpenAIWrapper, SiliconFlowAPI, HFChatModel 9 | model = kwargs.pop('model', None) 10 | kwargs.pop('nproc', None) 11 | load_env() 12 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None) 13 | if LOCAL_LLM is None: 14 | model_map = { 15 | 'gpt-4-turbo': 'gpt-4-1106-preview', 16 | 'gpt-4-0613': 'gpt-4-0613', 17 | 'gpt-4-0125': 'gpt-4-0125-preview', 18 | 'gpt-4-0409': 'gpt-4-turbo-2024-04-09', 19 | 'chatgpt-1106': 'gpt-3.5-turbo-1106', 20 | 'chatgpt-0125': 'gpt-3.5-turbo-0125', 21 | 'gpt-4o': 'gpt-4o-2024-05-13', 22 | 'gpt-4o-0806': 'gpt-4o-2024-08-06', 23 | 'gpt-4o-mini': 'gpt-4o-mini-2024-07-18', 24 | 'qwen-7b': 'Qwen/Qwen2.5-7B-Instruct', 25 | 'qwen-72b': 'Qwen/Qwen2.5-72B-Instruct', 26 | 'deepseek': 'deepseek-ai/DeepSeek-V3', 27 | 'llama31-8b': 'meta-llama/Llama-3.1-8B-Instruct', 28 | } 29 | model_version = model_map[model] 30 | else: 31 | model_version = LOCAL_LLM 32 | 33 | if model in ['qwen-7b', 'qwen-72b', 'deepseek']: 34 | model = SiliconFlowAPI(model_version, **kwargs) 35 | elif model == 'llama31-8b': 36 | model = HFChatModel(model_version, **kwargs) 37 | else: 38 | model = OpenAIWrapper(model_version, **kwargs) 39 | return model 40 | 41 | 42 | DEBUG_MESSAGE = """ 43 | To debug the OpenAI API, you can try the following script in Python: 44 | ```python 45 | from vlmeval.api import OpenAIWrapper 46 | model = OpenAIWrapper('gpt-4o', verbose=True) 47 | msgs = [dict(type='text', value='Hello!')] 48 | code, answer, resp = model.generate_inner(msgs) 49 | print(code, answer, resp) 50 | ``` 51 | You can see the specific error if the API call fails. 52 | """ 53 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/longvideobench.py: -------------------------------------------------------------------------------- 1 | from ...smp import * 2 | from .multiple_choice import extract_answer_from_item 3 | import numpy as np 4 | import re 5 | 6 | FAIL_MSG = 'Failed to obtain answer via API.'
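# A minimal usage sketch, assuming a score file whose rows carry the 'question_category',
# 'duration_group', and 'score' columns that get_dimension_rating() reads below; the file
# name is a placeholder:
#
#     rating = get_dimension_rating('LongVideoBench_model_score.xlsx')
#     print(rating['overall']['overall'])             # overall accuracy, formatted as a string
#     print(rating[600]['question_category']['S2E'])  # one duration-group / task-category bucket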
7 | 8 | DURATIONS = [15, 60, 600, 3600] 9 | TASK_CATEGORIES = [ 10 | "S2E", "S2O", "S2A", 11 | "E2O", "O2E", "T2E", 12 | "T2O", "T2A", "E3E", 13 | "O3O", "SSS", "SOS", 14 | "SAA", "T3E", "T3O", 15 | "TOS", "TAA" 16 | ] 17 | 18 | 19 | def get_dimension_rating(data_path): 20 | data = load(data_path) 21 | print(data.iloc[0]) 22 | 23 | duration_rating = {k: {} for k in DURATIONS} 24 | for duration in DURATIONS + ['overall']: 25 | duration_rating[duration] = { 26 | 'overall': '', 27 | 'question_category': {k: [] for k in TASK_CATEGORIES} 28 | } 29 | 30 | for i in range(len(data)): 31 | 32 | task_ctg = data.iloc[i]['question_category'] 33 | 34 | duration = data.iloc[i]['duration_group'] 35 | duration_rating[duration]['question_category'][task_ctg].append(data.iloc[i]['score']) 36 | 37 | duration_rating['overall']['question_category'][task_ctg].append(data.iloc[i]['score']) 38 | 39 | for duration in DURATIONS + ['overall']: 40 | overall_res_dur = f'{np.mean([x for x in sum(duration_rating[duration]["question_category"].values(), []) if x >= 0]):.3f}' # noqa: E501 41 | duration_rating[duration]['overall'] = overall_res_dur 42 | for task_ctg in TASK_CATEGORIES: 43 | task_res_dur = f'{np.mean([x for x in duration_rating[duration]["question_category"][task_ctg] if x >= 0]):.3f}' # noqa: E501 44 | duration_rating[duration]['question_category'][task_ctg] = task_res_dur 45 | 46 | return duration_rating 47 | 48 | 49 | def extract_option(model, input_item, dataset_name): 50 | options = input_item['question'].split('\n')[1:] 51 | for id, option in enumerate(options): 52 | option_id = chr(ord('A') + id) + '.' 53 | if option.find(option_id) >= 0: 54 | input_item[chr(ord('A') + id)] = option[option.find(option_id) + len(option_id):].strip('. \n') 55 | return extract_answer_from_item(model, input_item, dataset_name)['opt'] 56 | 57 | 58 | def extract_characters_regex(s): 59 | s = s.strip() 60 | answer_prefixes = [ 61 | 'The best answer is', 62 | 'The correct answer is', 63 | 'The answer is', 64 | 'The answer', 65 | 'The best option is' 66 | 'The correct option is', 67 | 'Best answer:' 68 | 'Best option:', 69 | 'Answer:', 70 | 'Option:', 71 | ] 72 | for answer_prefix in answer_prefixes: 73 | s = s.replace(answer_prefix, '') 74 | 75 | if len(s.split()) > 10 and not re.search('[ABCDE]', s): 76 | return '' 77 | matches = re.search(r'[ABCDE]', s) 78 | if matches is None: 79 | return '' 80 | return matches[0] 81 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/__init__.py: -------------------------------------------------------------------------------- 1 | from .aggregation_type import AggregationType 2 | from .metric_type import MetricType 3 | from .response_parse_type import ResponseParseType 4 | 5 | __all__ = [AggregationType, MetricType, ResponseParseType] 6 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/aggregation/mean_agg.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import Dict 3 | import numpy as np 4 | 5 | 6 | class MeanAggregation: 7 | """Take the mean of all valid scores.""" 8 | 9 | @staticmethod 10 | def aggregate(scores: Dict[str, Number], weights: Dict[str, Number]) -> Number: 11 | """Exact match between targets and responses.""" 12 | filtered_scores = {f: s for f, s in scores.items() if s >= 0} 13 | if not filtered_scores: 14 | return -1 15 | 16 | # Align the key order 17 | 
flattened_scores = [] 18 | flattened_weights = [] 19 | for field in filtered_scores: 20 | flattened_scores.append(filtered_scores[field]) 21 | flattened_weights.append(weights[field]) 22 | return np.average(flattened_scores, weights=flattened_weights) 23 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/aggregation/min_agg.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import Dict 3 | 4 | 5 | class MinAggregation: 6 | """Take the minimum of all valid scores.""" 7 | 8 | @staticmethod 9 | def aggregate(scores: Dict[str, Number], weights: Dict[str, Number]) -> Number: 10 | """Exact match between targets and responses.""" 11 | filtered_scores = [s for s in scores.values() if s >= 0] 12 | if not filtered_scores: 13 | return -1 14 | return min(filtered_scores) 15 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/aggregation/unsupported_agg.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from typing import Dict 3 | 4 | 5 | class UnsupportedAggregation: 6 | @staticmethod 7 | def aggregate(scores: Dict[str, Number], weights: Dict[str, Number]) -> Number: 8 | return -1 9 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/aggregation_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class AggregationType(Enum): 4 | MEAN = 0 5 | 6 | @classmethod 7 | def from_string(cls, s): 8 | return cls.MEAN 9 | 10 | def aggregate(self, field_scores, field_weights): 11 | if not field_scores: 12 | return 0.0 13 | 14 | total_score = 0.0 15 | total_weight = 0.0 16 | 17 | for field, score in field_scores.items(): 18 | weight = field_weights.get(field, 1.0) 19 | try: 20 | total_score += score * weight 21 | except: 22 | total_score += score[0] * weight 23 | total_weight += weight 24 | 25 | return total_score / total_weight if total_weight > 0 else 0.0 26 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/parsing/dummy_parse.py: -------------------------------------------------------------------------------- 1 | class DummyParse: 2 | 3 | @staticmethod 4 | def parse(response: str, *args, **kwargs) -> dict: 5 | """return the raw string without doing anything""" 6 | return response.strip() 7 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/parsing/json_parse.py: -------------------------------------------------------------------------------- 1 | from .common.parsers import parse_json 2 | from .common.utils import evaluate_as_string 3 | 4 | 5 | class JsonParse: 6 | """Load the response as a JSON object.""" 7 | 8 | @staticmethod 9 | def parse(response: str): 10 | """Parse the JSON object, including nested JSON strings.""" 11 | parsed_res = parse_json(response) 12 | # Drop the potentially duplicated string quotes 13 | if isinstance(parsed_res, dict): 14 | for key, val in parsed_res.items(): 15 | parsed_res[key] = evaluate_as_string(val) 16 | 17 | return parsed_res 18 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/requirements.txt: -------------------------------------------------------------------------------- 1 
| antlr4-python3-runtime==4.11.0 2 | filelock==3.16.1 3 | geopy==2.4.1 4 | jieba==0.42.1 5 | nltk==3.9.1 6 | numpy==1.26.4 7 | pronouncing==0.2.0 8 | rapidfuzz==3.9.5 9 | regex==2024.7.24 10 | requests==2.32.3 11 | requests_cache==1.2.1 12 | sacrebleu==2.4.3 13 | sympy==1.13.2 14 | tqdm==4.66.4 15 | Unidecode==1.3.8 16 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/response_parse_type.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | from enum import Enum 3 | from .parsing.json_parse import JsonParse 4 | from .parsing.answer_str_parse import ( 5 | AnswerStrParse, 6 | AsciiAnswerStrParse, 7 | VerbatimAnswerStrParse, 8 | ) 9 | from vlmeval.dataset.utils.megabench.parsing.dummy_parse import DummyParse 10 | 11 | 12 | class ResponseParseType(Enum): 13 | """Parse the response.""" 14 | 15 | JSON = "json" 16 | ANSWER_STR = "answer_string" 17 | ASCII_ANSWER_STR = "ascii_answer_string" 18 | VERBATIM_ANSWER_STR = "verbatim_answer_string" 19 | DUMMY = "dummy" 20 | UNSUPPORTED = "unsupported" 21 | 22 | @cached_property 23 | def class_impl(self): 24 | if self == ResponseParseType.ANSWER_STR: 25 | return AnswerStrParse 26 | elif self == ResponseParseType.ASCII_ANSWER_STR: 27 | return AsciiAnswerStrParse 28 | elif self == ResponseParseType.VERBATIM_ANSWER_STR: 29 | return VerbatimAnswerStrParse 30 | elif self == ResponseParseType.DUMMY: 31 | return DummyParse 32 | else: 33 | return JsonParse 34 | 35 | def is_single_field_parser(self): 36 | return self in [ 37 | ResponseParseType.ANSWER_STR, 38 | ResponseParseType.ASCII_ANSWER_STR, 39 | ResponseParseType.VERBATIM_ANSWER_STR, 40 | ] 41 | 42 | def parse(self, response: str, *args, **kwargs): 43 | """Parse the response.""" 44 | return self.class_impl.parse(response, *args, **kwargs) 45 | 46 | @staticmethod 47 | def from_string(s): 48 | """Initialize the response parsing type from a string.""" 49 | try: 50 | if s is None: 51 | return ResponseParseType("unsupported") 52 | return ResponseParseType(s.lower()) 53 | except KeyError as exc: 54 | raise ValueError(f"Invalid metric type: {s}") from exc 55 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/chess_jaccard.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, Any 3 | from .common.conversions import str_to_set 4 | from .common.metrics import jaccard_index 5 | 6 | 7 | def chess_transform(move_sequence: str) -> set: 8 | """Transform a sequence of chess moves encoded in SAN into a set.""" 9 | move_sequence = str_to_set(move_sequence) 10 | return {move_san.removesuffix("!").removesuffix("#") for move_san in move_sequence} 11 | 12 | 13 | class ChessMoveJaccard: 14 | """Calculates the Jacard index for chess moves.""" 15 | 16 | @classmethod 17 | def match(cls, responses: str | None, targets: str) -> float: 18 | """Exact match between targets and responses.""" 19 | if responses is None: 20 | return 0 21 | responses = chess_transform(responses) 22 | targets = chess_transform(targets) 23 | 24 | return jaccard_index(responses, targets) 25 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/coordinate_sequence_match.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .common.conversions 
import str_to_coords 3 | import numpy as np 4 | 5 | 6 | class CoordsSequenceSimilarity: 7 | """ 8 | Measure the similarity between two list of coordinates, used for keypoint estimation tasks 9 | """ 10 | 11 | @staticmethod 12 | def compute_score(pred_keypoints, gt_keypoints, k=10): 13 | """ 14 | Compute the evaluation score for keypoint estimation. 15 | 16 | Args: 17 | pred_keypoints (list or np.ndarray): List or array of predicted keypoint coordinates, 18 | each as (x, y), normalized to [0, 1]. 19 | gt_keypoints (list or np.ndarray): List or array of ground truth keypoint coordinates, 20 | each as (x, y), normalized to [0, 1]. 21 | 22 | Returns: 23 | float: A score between 0 and 1, where 1 indicates perfect accuracy, 24 | and 0 indicates completely wrong. 25 | """ 26 | # Convert inputs to NumPy arrays 27 | try: 28 | pred_keypoints = np.array(pred_keypoints) 29 | except ValueError: 30 | # Format is not a correct 31 | return 0 32 | 33 | gt_keypoints = np.array(gt_keypoints) 34 | 35 | # shape mismatch, directly assign 0 score 36 | if pred_keypoints.shape != gt_keypoints.shape: 37 | return 0 38 | 39 | # Compute Euclidean distances between corresponding keypoints 40 | distances = np.linalg.norm(pred_keypoints - gt_keypoints, axis=1) 41 | 42 | # Maximum possible distance in normalized coordinate space 43 | max_distance = np.sqrt(2) 44 | 45 | # Normalize distances 46 | normalized_distances = distances / max_distance 47 | 48 | # Compute per-keypoint scores using exponential decay 49 | per_keypoint_scores = np.exp(-k * normalized_distances) 50 | 51 | # Compute the average score across all keypoints 52 | score = np.mean(per_keypoint_scores) 53 | 54 | return score 55 | 56 | @classmethod 57 | def match(cls, responses, targets) -> float: 58 | """Exact match between targets and responses.""" 59 | logging.debug(f"{responses=}, {targets=}") 60 | if not isinstance(responses, (tuple | list)): 61 | responses = str_to_coords(responses, dim=2) 62 | if not isinstance(targets, (tuple | list)): 63 | targets = str_to_coords(targets, dim=2) 64 | 65 | return cls.compute_score(responses, targets) 66 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/dict_equality.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_dict 2 | from .simple_str_match import ExactStrMatch 3 | 4 | 5 | class DictEquality: 6 | """Calculates the exact string match across the dict. 7 | 8 | 1. Calculates the exact match for all keys in the solution 9 | 2. 
Calculates the total, then divides by the size of the solution 10 | """ 11 | 12 | @classmethod 13 | def match(cls, responses, targets) -> float: 14 | """Return the aggregated Jaccard index between targets and responses.""" 15 | responses = cast_to_dict(responses) 16 | targets = cast_to_dict(targets) 17 | 18 | if not isinstance(responses, dict): 19 | return 0 20 | 21 | return 1 if responses == targets else 0 22 | 23 | 24 | class DictPrecision: 25 | 26 | @classmethod 27 | def match(cls, responses, targets) -> float: 28 | """Return the aggregated Jaccard index between targets and responses.""" 29 | responses = cast_to_dict(responses) 30 | targets = cast_to_dict(targets) 31 | 32 | if not isinstance(responses, dict): 33 | return 0 34 | 35 | if len(responses) == 0: 36 | return 0 37 | 38 | matched = 0 39 | for key, val in responses.items(): 40 | if key in targets: 41 | if ExactStrMatch.match(val, targets[key]): 42 | matched += 1 43 | 44 | return matched / len(responses) 45 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/dict_exact_match_agg_recall.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_dict 2 | from .exact_str_match import ExactStrMatch 3 | 4 | 5 | class DictExactStrMatchAggRecall: 6 | """Calculates the exact string match across the dict. 7 | 8 | 1. Calculates the exact match for all keys in the solution 9 | 2. Calculates the total, then divides by the size of the solution 10 | """ 11 | 12 | @classmethod 13 | def match(cls, responses, targets) -> float: 14 | """Return the aggregated Jaccard index between targets and responses.""" 15 | responses = cast_to_dict(responses) 16 | targets = cast_to_dict(targets) 17 | 18 | if not isinstance(responses, dict): 19 | return 0 20 | 21 | num_keys = 0 22 | total_score = 0 23 | for key, answer in targets.items(): 24 | total_score += ExactStrMatch.match(responses.get(key), answer) 25 | num_keys += 1 26 | 27 | return total_score / num_keys 28 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/dict_jaccard_agg_jaccard.py: -------------------------------------------------------------------------------- 1 | from .jaccard import Jaccard 2 | from .common.conversions import cast_to_dict 3 | 4 | 5 | class DictJaccardAggJaccard: 6 | """Calculates the Jaccard index, dividing by the union of the predictions. 7 | 8 | 1. Calculates the Jaccard index for all sets with the same key, 9 | if it appears in either pred or targets 10 | 2. 
Calculates the total, then divides by the size of the union 11 | """ 12 | 13 | @classmethod 14 | def match(cls, responses, targets) -> float: 15 | """Return the aggregated Jaccard index between targets and responses.""" 16 | responses = cast_to_dict(responses) 17 | if not isinstance(responses, dict): 18 | return 0 19 | 20 | all_keys = set(responses) | set(targets) 21 | 22 | num_keys = 0 23 | total_score = 0 24 | for key in all_keys: 25 | total_score += Jaccard.match(responses.get(key, []), targets.get(key, [])) 26 | num_keys += 1 27 | 28 | return total_score / num_keys 29 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/dict_nbbox_iou_tuple_agg_jaccard.py: -------------------------------------------------------------------------------- 1 | from .nbbox_iou import NbboxIouTuple 2 | 3 | 4 | class DictNbboxIouTupleAggJaccard: 5 | """Calculates the average precision IoU across the dict. 6 | 7 | 1. Calculates the precision IoU for all sets with the same key, 8 | if it appears in either pred or targets 9 | 2. Calculates the total, then divides by the size of the union 10 | """ 11 | 12 | @classmethod 13 | def match(cls, responses, targets) -> float: 14 | """Return the aggregated Jaccard index between targets and responses.""" 15 | if not isinstance(responses, dict): 16 | return 0 17 | all_keys = set(responses) | set(targets) 18 | 19 | num_keys = 0 20 | total_score = 0 21 | for key in all_keys: 22 | total_score += NbboxIouTuple.match( 23 | responses.get(key, []), targets.get(key, []) 24 | ) 25 | num_keys += 1 26 | 27 | return total_score / num_keys 28 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/dict_set_equality_agg_jaccard.py: -------------------------------------------------------------------------------- 1 | from vlmeval.dataset.utils.megabench.scoring.set_equality import SetEquality 2 | 3 | 4 | class DictSetEqualityAggJaccard: 5 | """Calculates the average set equality across the dict. 6 | 7 | 1. Calculates the set equality for all sets with the same key, 8 | if it appears in either pred or targets 9 | 2. Calculates the total, then divides by the size of the union 10 | """ 11 | 12 | @classmethod 13 | def match(cls, responses, targets) -> float: 14 | """Return the aggregated Jaccard index between targets and responses.""" 15 | if not isinstance(responses, dict): 16 | return 0 17 | 18 | all_keys = set(responses) | set(targets) 19 | 20 | num_keys = 0 21 | total_score = 0 22 | for key in all_keys: 23 | total_score += SetEquality.match( 24 | responses.get(key, []), targets.get(key, []) 25 | ) 26 | num_keys += 1 27 | 28 | return total_score / num_keys 29 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/exact_str_match.py: -------------------------------------------------------------------------------- 1 | import re 2 | from ..parsing.common.utils import extract_code_block_content 3 | 4 | 5 | def parse_single_letter(s): 6 | # Regular expression to match (A)XXXXX, A . 
XXXXXXX, or A.XXXXXX 7 | match = re.match(r"^\(?([A-Za-z])\)?(?:\s*\.\s*|\.)?(.*)", s) 8 | 9 | if match: 10 | # Extract and return the single letter 11 | return match.group(1) 12 | else: 13 | # Return the original string if no match is found 14 | return s 15 | 16 | 17 | class ExactStrMatch: 18 | """Exact string matching.""" 19 | 20 | @staticmethod 21 | def match(response: str, correct_answer: str) -> int: 22 | """Exact match between targets and responses.""" 23 | if not isinstance(response, str): 24 | response = str(response) 25 | if not isinstance(correct_answer, str): 26 | correct_answer = str(correct_answer) 27 | 28 | if len(correct_answer) == 1 and correct_answer.isalpha() and len(response) > 1: 29 | # handle special case of choice letter, 30 | # drop the potential parenthesis 31 | response = parse_single_letter(response) 32 | 33 | return 1 if response == correct_answer else 0 34 | 35 | 36 | class CodeResultExactStrMatch: 37 | """Exact string matching, with the results from a results code block.""" 38 | 39 | @staticmethod 40 | def match(response: str, correct_answer: str) -> int: 41 | """Exact match between targets and responses.""" 42 | correct_answer, is_code = extract_code_block_content( 43 | correct_answer, 44 | is_ascii_art=True, 45 | should_remove_surrounding_whitespace=False, 46 | ) 47 | # assert is_code 48 | return ExactStrMatch.match(response, correct_answer) 49 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/exact_str_match_case_insensitive.py: -------------------------------------------------------------------------------- 1 | from .exact_str_match import ExactStrMatch 2 | 3 | 4 | class ExactStrMatchCaseInsensitive: 5 | """Case-insensitive exact string matching.""" 6 | 7 | @staticmethod 8 | def match(response, correct_answer) -> int: 9 | """Case-insensitive exact match between targets and responses.""" 10 | if not isinstance(response, str) and isinstance(correct_answer, str): 11 | return 0 12 | return ExactStrMatch.match(response.lower(), correct_answer.lower()) 13 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/gleu.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | import jieba 3 | from nltk.translate.gleu_score import sentence_gleu 4 | 5 | 6 | class GLEUChinese: 7 | """Compute GLEU score for Chinese text.""" 8 | 9 | @staticmethod 10 | def match(response, correct_answer) -> Number: 11 | """Compute the BLEU scores between two strings.""" 12 | if isinstance(response, str) and isinstance(correct_answer, str): 13 | reference_tokens = list(jieba.cut_for_search(response)) 14 | translation_tokens = list(jieba.cut_for_search(correct_answer)) 15 | else: 16 | return 0 17 | return sentence_gleu([reference_tokens], translation_tokens) 18 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/jaccard.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_set 2 | from .common.metrics import jaccard_index 3 | 4 | 5 | class Jaccard: 6 | """Calculates the Jacard index for iterables.""" 7 | 8 | @classmethod 9 | def match(cls, responses, targets) -> float: 10 | """Exact match between targets and responses.""" 11 | if responses is None: 12 | return 0 13 | responses = cast_to_set(responses) 14 | targets = cast_to_set(targets) 15 | 16 | 
return jaccard_index(responses, targets) 17 | 18 | 19 | class JaccardCaseInsensitive: 20 | """Calculates the Jacard index for iterables of strings, 21 | Do not consider the case 22 | """ 23 | 24 | @classmethod 25 | def match(cls, responses, targets) -> float: 26 | """Exact match between targets and responses.""" 27 | if responses is None: 28 | return 0 29 | responses = cast_to_set(responses) 30 | targets = cast_to_set(targets) 31 | 32 | if isinstance(list(targets)[0], str): 33 | new_responses = { 34 | item.lower() if isinstance(item, str) else str(item).lower() 35 | for item in responses 36 | } 37 | new_targets = {item.lower() for item in targets} 38 | elif isinstance(list(targets)[0], tuple): 39 | new_responses = set() 40 | new_targets = set() 41 | try: 42 | for res in responses: 43 | new_res = tuple( 44 | [ 45 | item.lower() 46 | .replace(" ", "") 47 | .replace("-", "") 48 | .replace("\n", "") 49 | .replace("\t", "") 50 | .replace("_", "") 51 | .replace(".", "") 52 | for item in res 53 | ] 54 | ) 55 | new_responses.add(new_res) 56 | except: # the data type of the response might be wrong, return 0 in this case 57 | return 0 58 | for tgt in targets: 59 | new_tgt = tuple( 60 | [ 61 | item.lower() 62 | .replace(" ", "") 63 | .replace("-", "") 64 | .replace("\n", "") 65 | .replace("\t", "") 66 | .replace("_", "") 67 | .replace(".", "") 68 | for item in tgt 69 | ] 70 | ) 71 | new_targets.add(new_tgt) 72 | else: 73 | return 0 74 | 75 | return jaccard_index(new_responses, new_targets) 76 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/longest_common_list_prefix_ratio.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import str_to_list 2 | from .common.metrics import longest_common_prefix 3 | 4 | 5 | class LongestCommonListPrefixRatio: 6 | """Determines how much of the first part of the list 7 | was predicted correctly. 
8 | """ 9 | 10 | @classmethod 11 | def match(cls, responses, targets) -> int: 12 | """Exact match between targets and responses.""" 13 | responses = str_to_list(responses) 14 | targets = str_to_list(targets) 15 | return len(longest_common_prefix(responses, targets)) / len(targets) 16 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/mse.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import numpy as np 3 | import math 4 | from .common.metrics import mse 5 | from .common.conversions import str_to_list 6 | 7 | 8 | class MSE: 9 | """Mean Squared Error.""" 10 | 11 | @staticmethod 12 | def match(response: str, correct_answer: str) -> int: 13 | """Return the mean squared error.""" 14 | try: 15 | return mse(ast.literal_eval(response), ast.literal_eval(correct_answer)) 16 | except (SyntaxError, ValueError): 17 | return 0 18 | 19 | 20 | class NormalizedRMSE: 21 | """Mean Squared Error.""" 22 | 23 | MIN = 0.0 24 | MAX = 0.1 25 | 26 | @classmethod 27 | def match(cls, response: str, correct_answer: str) -> int: 28 | """Return the mean squared error.""" 29 | try: 30 | mse_val = mse(ast.literal_eval(response), ast.literal_eval(correct_answer)) 31 | rmse = np.clip(np.sqrt(mse_val), cls.MIN, cls.MAX) 32 | norm_rmse = 1 - (rmse - cls.MIN) / (cls.MAX - cls.MIN) 33 | return norm_rmse 34 | except (SyntaxError, ValueError): 35 | return 0 36 | 37 | 38 | class AngleSeqFloatRMSE: 39 | """Whether the sequence of numbers is close enough to the real answer.""" 40 | 41 | MIN = 0.0 42 | MAX = 10.0 43 | 44 | @classmethod 45 | def match(cls, responses, targets) -> float: 46 | """Determines whether the sequence of floats are close enough to the real answer.""" 47 | responses = str_to_list(responses) 48 | targets = str_to_list(targets) 49 | 50 | if len(responses) != len(targets): 51 | return 0 52 | 53 | try: 54 | res = np.array(responses) 55 | tgt = np.array(targets) 56 | rmse = np.sqrt(mse(res, tgt)).sum() / len(targets) 57 | except: # cannot obtain the rmse from the response, return 0 58 | return 0 59 | 60 | rmse = np.clip(rmse, cls.MIN, cls.MAX) 61 | norm_rmse = 1 - (rmse - cls.MIN) / (cls.MAX - cls.MIN) 62 | if math.isnan(norm_rmse): 63 | return 0 64 | return norm_rmse 65 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/multi_ref_phrase.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | from .common.conversions import str_to_iterable 3 | from .simple_str_match import SimpleStrMatch 4 | 5 | 6 | def replace_potential_chinese_comma(input_string): 7 | return input_string.replace(",", ",") 8 | 9 | 10 | class MultipleReferencePhraseEval: 11 | """ 12 | Check the response with multiple correct references 13 | As long as one is matched, the score is 1, otherwise the score is 0 14 | """ 15 | 16 | @staticmethod 17 | def match(response, targets) -> Number: 18 | targets = replace_potential_chinese_comma(targets) 19 | refs = str_to_iterable(list, targets) 20 | matched = False 21 | for ref in refs: 22 | str_ref = ref if isinstance(ref, str) else str(ref) 23 | if SimpleStrMatch.match(response, str_ref): 24 | matched = True 25 | break 26 | return 1 if matched else 0 27 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/near_str_match.py: 
-------------------------------------------------------------------------------- 1 | import rapidfuzz 2 | import unidecode 3 | from .common.transformations import remove_def_indef_articles 4 | 5 | 6 | def approximate(text: str) -> str: 7 | """Return an approximation of the original string.""" 8 | return unidecode.unidecode(remove_def_indef_articles(text)).lower() 9 | 10 | 11 | class NearStrMatch: 12 | """Near string matching.""" 13 | 14 | @staticmethod 15 | def match(response, correct_answer: str, threshold=0.9) -> int: 16 | """Simple string match between response and correct_answer.""" 17 | if not isinstance(response, str) or not isinstance(correct_answer, str): 18 | return 0 19 | response = approximate(response) 20 | correct_answer = approximate(correct_answer) 21 | return rapidfuzz.distance.DamerauLevenshtein.normalized_similarity( 22 | response, correct_answer, score_cutoff=threshold 23 | ) 24 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/nli_entailment.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import pipeline 3 | 4 | 5 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 6 | pipe = pipeline( 7 | "text-classification", model="microsoft/deberta-large-mnli", device=device 8 | ) 9 | 10 | 11 | class NliEntailment: 12 | """NLI entailment, where the correct answer is used as the premise.""" 13 | 14 | @staticmethod 15 | def match(response, correct_answer) -> int: 16 | """Return whether the response and correct answer agree with each other.""" 17 | if not isinstance(response, str) or not isinstance(correct_answer, str): 18 | return 0 19 | resp = pipe(f"[CLS] {correct_answer.strip()} [SEP] {response.strip()} [SEP]") 20 | return 1 if resp[0]["label"] == "ENTAILMENT" else 0 21 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/normalized_similarity_damerau_levenshtein.py: -------------------------------------------------------------------------------- 1 | import rapidfuzz 2 | 3 | 4 | class NormalizedSimilarityDamerauLevenshtein: 5 | """Normalized Damerau-Levenshtein Similarity.""" 6 | 7 | @staticmethod 8 | def match(response, correct_answer) -> int: 9 | """Normalized Damerau-Levenshtein similarity between targets and responses.""" 10 | if not isinstance(response, str) or not isinstance(correct_answer, str): 11 | return 0 12 | return rapidfuzz.distance.DamerauLevenshtein.normalized_similarity( 13 | response, correct_answer 14 | ) 15 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/number_rel_diff_ratio.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import math 3 | from numbers import Number 4 | 5 | 6 | class NumberRelDiffRatio: 7 | """Number relative difference ratio scoring = max(0, 1 - |pred - gt| / gt)""" 8 | 9 | @staticmethod 10 | def match(response: str | Number, correct_answer: str) -> int: 11 | """Return the relative difference ratio.""" 12 | try: 13 | if isinstance(response, Number): 14 | pred = response 15 | else: 16 | pred = ast.literal_eval(response) 17 | if not isinstance(pred, Number): 18 | return 0 19 | gt = ast.literal_eval(correct_answer) 20 | return max(0, 1 - math.fabs((pred - gt) / gt)) 21 | except (SyntaxError, ValueError): 22 | return 0 23 |
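# A quick sanity check of the relative-difference scoring above, assuming string-encoded
# numeric answers (the values are illustrative, not taken from any benchmark file):
#
#     NumberRelDiffRatio.match('95', '100')   # 1 - |95 - 100| / 100 = 0.95
#     NumberRelDiffRatio.match(250, '100')    # 1 - 150 / 100 is negative, so max(0, ...) clips it to 0
#     NumberRelDiffRatio.match('N/A', '100')  # ast.literal_eval fails, the except branch returns 0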
-------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/positive_int_match.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | 4 | class PositiveIntMatch: 5 | """Positive int matching.""" 6 | 7 | @staticmethod 8 | def match(response: str, correct_answer: str) -> int: 9 | """If the correct answer or response is a positive integer, then it returns if the predicted and correct answers are identical. 10 | 11 | Otherwise, it returns -1. 12 | """ 13 | try: 14 | response_obj = ast.literal_eval(response) 15 | except (SyntaxError, ValueError): 16 | return 0 17 | 18 | if not correct_answer: 19 | return 0 20 | 21 | correct_answer_obj = ast.literal_eval(correct_answer) 22 | 23 | assert isinstance(correct_answer_obj, int) 24 | if not isinstance(response_obj, int): 25 | return 0 26 | 27 | # We only want to score the fields with a positive amount 28 | if correct_answer_obj <= 0 and response_obj <= 0: 29 | return -1 30 | 31 | return 1 if response_obj == correct_answer_obj else 0 32 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/sacrebleu_bleu.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | import sacrebleu 3 | 4 | 5 | class Bleu: 6 | """Compute BLEU score, using SacreBLEU.""" 7 | 8 | @staticmethod 9 | def match(response, correct_answer) -> Number: 10 | """Compute the BLEU scores between two strings.""" 11 | if isinstance(response, str) and isinstance(correct_answer, str): 12 | resp = [response] 13 | corr = [correct_answer] 14 | elif isinstance(response, (list, tuple)) and isinstance( 15 | correct_answer, (list, tuple) 16 | ): 17 | resp = tuple(response) 18 | corr = tuple(correct_answer) 19 | else: 20 | return 0 21 | result = sacrebleu.corpus_bleu(corr, [resp]).score / 100 22 | return result 23 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/sequence_equality.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import str_to_list 2 | from numbers import Number 3 | 4 | 5 | class SequenceEquality: 6 | """Determines how much of the first part of the list 7 | was predicted correctly. 8 | """ 9 | 10 | @classmethod 11 | def match(cls, responses, targets) -> int: 12 | """Exact match between targets and responses.""" 13 | if not isinstance(responses, str): 14 | responses = str(responses) 15 | responses = str_to_list(responses) 16 | targets = str_to_list(targets) 17 | return 1 if responses == targets else 0 18 | 19 | 20 | class SequenceEqualityCaseInsensitive: 21 | """Determines how much of the first part of the list 22 | was predicted correctly. 23 | """ 24 | 25 | @classmethod 26 | def match(cls, responses, targets) -> int: 27 | """Exact match between targets and responses.""" 28 | if not isinstance(responses, str): 29 | responses = str(responses) 30 | responses = str_to_list(responses) 31 | targets = str_to_list(targets) 32 | 33 | responses = [ 34 | item.lower() if isinstance(item, str) else str(item) for item in responses 35 | ] 36 | targets = [item.lower() for item in targets] 37 | return 1 if responses == targets else 0 38 | 39 | 40 | class SequenceAccuracyCaseInsensitive: 41 | """Determines how much of the first part of the list 42 | was predicted correctly. 
43 | """ 44 | 45 | @classmethod 46 | def match(cls, responses, targets) -> int: 47 | """Exact match between targets and responses.""" 48 | responses = str_to_list(responses) 49 | targets = str_to_list(targets) 50 | if len(targets) != len(responses): 51 | return 0 52 | correct = 0 53 | for res, tgt in zip(responses, targets): 54 | if isinstance(tgt, str): 55 | if res.lower() == tgt.lower(): 56 | correct += 1 57 | elif isinstance(tgt, Number) and isinstance(res, Number): 58 | if res == tgt: 59 | correct += 1 60 | else: 61 | pass 62 | return correct / len(targets) 63 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/set_equality.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_set, str_to_set 2 | 3 | 4 | def _convert_to_hashable(item): 5 | """Convert an unhashable item (e.g. a list) into a hashable one.""" 6 | if isinstance(item, (list, tuple)): 7 | return tuple(item) # convert lists to tuples 8 | return item 9 | 10 | 11 | class SetEquality: 12 | """Determines whether two sets are equal.""" 13 | 14 | @classmethod 15 | def match(cls, responses, targets) -> int: 16 | """Exact match between targets and responses.""" 17 | if isinstance(responses, (list, tuple)): 18 | responses = {_convert_to_hashable(item) for item in responses} 19 | if isinstance(targets, (list, tuple)): 20 | targets = {_convert_to_hashable(item) for item in targets} 21 | return 1 if responses == targets else 0 22 | 23 | 24 | class SetEqualityCaseInsensitive: 25 | """Determines whether two sets are equal, ignoring string case.""" 26 | 27 | @classmethod 28 | def match(cls, responses, targets) -> int: 29 | """Exact match between targets and responses.""" 30 | try: 31 | responses: set[str] = {text.upper() for text in cast_to_set(responses)} 32 | targets: set[str] = {text.upper() for text in cast_to_set(targets)} 33 | except AttributeError: 34 | return 0 35 | return 1 if responses == targets else 0 36 | 37 | 38 | class StringSetEqualityLineSplit: 39 | """Determines whether two sets are equal, for string inputs, separated by line breaks""" 40 | 41 | @classmethod 42 | def match(cls, responses, targets) -> int: 43 | if "\\n" in targets: 44 | targets = targets.replace("\\n", "\n") 45 | if "\\n" in responses: 46 | responses = responses.replace("\\n", "\n") 47 | responses_set = set(responses.split("\n")) 48 | targets_set = set(targets.split("\n")) 49 | responses_set = { 50 | item.lower() if isinstance(item, str) else item for item in responses_set 51 | } 52 | targets_set = { 53 | item.lower() if isinstance(item, str) else item for item in targets_set 54 | } 55 | return 1 if responses_set == targets_set else 0 56 | 57 | 58 | class StringSetEqualityCommaSplit: 59 | """Determines whether two sets are equal, for string inputs, separated by commas 60 | Handles some corner cases that would fail the general SetEquality metric, like the string 61 | with "None", which fails the eval. Also does case-insensitive comparison.
62 | """ 63 | 64 | @classmethod 65 | def match(cls, responses, targets) -> int: 66 | responses_set = str_to_set(responses) 67 | targets_set = str_to_set(targets) 68 | responses_set = { 69 | item.lower() if isinstance(item, str) else item for item in responses_set 70 | } 71 | targets_set = { 72 | item.lower() if isinstance(item, str) else item for item in targets_set 73 | } 74 | return 1 if responses_set == targets_set else 0 75 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/set_precision.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import cast_to_set 2 | from .common.metrics import set_precision 3 | 4 | 5 | class SetPrecision: 6 | """Calculates the set precision for iterables.""" 7 | 8 | @classmethod 9 | def match(cls, responses, targets) -> float: 10 | """Exact match between targets and responses.""" 11 | if responses is None: 12 | return 0 13 | responses = cast_to_set(responses) 14 | targets = cast_to_set(targets) 15 | 16 | return set_precision(responses, targets) 17 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/simple_str_match.py: -------------------------------------------------------------------------------- 1 | from .exact_str_match import ExactStrMatch 2 | 3 | 4 | class SimpleStrMatch: 5 | """Basic string matching, without spaces or hyphens.""" 6 | 7 | @staticmethod 8 | def match(response, correct_answer: str) -> int: 9 | """Simple string match between response and correct_answer.""" 10 | if not isinstance(response, str): 11 | response = str(response) # If it is JSON-like 12 | response = ( 13 | response.replace(" ", "") 14 | .replace("-", "") 15 | .replace("\n", "") 16 | .replace("\t", "") 17 | .replace(".", "") 18 | .lower() 19 | ) 20 | correct_answer = ( 21 | correct_answer.replace(" ", "") 22 | .replace("-", "") 23 | .replace("\n", "") 24 | .replace("\t", "") 25 | .replace(".", "") 26 | .lower() 27 | ) 28 | 29 | return ExactStrMatch.match(response, correct_answer) 30 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/unsupported_scoring.py: -------------------------------------------------------------------------------- 1 | class UnsupportedScoring: 2 | """Unsupported scoring.""" 3 | 4 | @staticmethod 5 | def match(response: str, correct_answer: str) -> int: 6 | """Default response for unimplemented metrics.""" 7 | return -1 8 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/xml_nbbox_iou.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .common.metrics import calculate_iou 3 | from .common.conversions import parse_bboxes_from_xml 4 | from numbers import Number 5 | 6 | 7 | class XmlNbboxIouSingle: 8 | """Calculates the IoU of bounding box. 
9 | 10 | Assumes that co-ordinates are normalized between 0 and 1 and that the bounding boxes 11 | are of the form top_left_x, top_left_y, bottom_right_x, bottom_right_y 12 | """ 13 | 14 | @classmethod 15 | def match(cls, responses, targets) -> float: 16 | 17 | logging.debug(f"{responses=}, {targets=}") 18 | if not isinstance(responses, (tuple | list)): 19 | responses = parse_bboxes_from_xml(responses) 20 | if not isinstance(targets, (tuple | list)): 21 | targets = parse_bboxes_from_xml(targets) 22 | 23 | if len(responses) == 0: 24 | return 0 25 | elif isinstance(responses[0], Number) and len(responses) == 4: 26 | responses = [responses] 27 | 28 | iou_scores = calculate_iou(responses, targets) 29 | if not iou_scores: 30 | return 0 31 | 32 | # Take the mean IoU score for now. 33 | return sum(iou_scores) / len(iou_scores) 34 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/xml_norm_point_distance.py: -------------------------------------------------------------------------------- 1 | """Return the normalized point distance.""" 2 | 3 | from .common.conversions import parse_point_2d_from_xml 4 | from .common.metrics import point_distance 5 | 6 | 7 | class XmlNormPointDistance: 8 | """Determines the distance between two points in XML notation. 9 | 10 | Assumes that co-ordinates are normalized between 0 and 1 and that the 2D point is 11 | of the form x, y. 12 | """ 13 | 14 | @classmethod 15 | def parse_2d_point(cls, point) -> tuple[float, float]: 16 | """Parse a 2D point encoded in XML as x, y.""" 17 | if not isinstance(point, (tuple | list)): 18 | point = parse_point_2d_from_xml(point) 19 | if not point: 20 | raise ValueError("Point could not be parsed from XML string.") 21 | elif len(point) != 2: 22 | raise ValueError("Point is not 2D.") 23 | if not all(0 <= comp <= 1 for comp in point): 24 | raise ValueError("Point is not normalized.") 25 | return tuple(point) 26 | 27 | @classmethod 28 | def match(cls, responses, targets) -> float: 29 | """Determine the normalized distance between two points.""" 30 | try: 31 | responses = cls.parse_2d_point(responses) 32 | targets = cls.parse_2d_point(targets) 33 | except ValueError: 34 | return 0 35 | 36 | # Instead of normalizing by 1/sqrt(2), we just set it to 0 if the distance is above 1. 37 | return max(0, 1 - point_distance(responses, targets)) 38 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/scoring/xml_norm_point_in_bbox.py: -------------------------------------------------------------------------------- 1 | from .common.conversions import parse_point_2d_from_xml, str_to_bboxes 2 | 3 | 4 | class XmlNormPointInBbox: 5 | """Determines whether a point is located in a bounding box. 
6 | 7 | Assumes that co-ordinates are normalized between 0 and 1 and that the 2D point is 8 | of the form x, y 9 | """ 10 | 11 | @classmethod 12 | def match(cls, responses, eval_context) -> int: 13 | """Determine if the point is in the bounding box 14 | and return which bounding box was matched, if any.""" 15 | bounding_box_has_match = { 16 | bbox: False for bbox in eval_context["bounding_boxes"] 17 | } 18 | bounding_boxes = [ 19 | str_to_bboxes(bbox_str)[0] for bbox_str in eval_context["bounding_boxes"] 20 | ] 21 | assert bounding_boxes 22 | 23 | if not isinstance(responses, (tuple | list)): 24 | responses = parse_point_2d_from_xml(responses) 25 | if not responses: 26 | return 0, bounding_box_has_match 27 | elif len(responses) != 2: 28 | return 0, bounding_box_has_match 29 | 30 | x, y = responses 31 | for min_x, min_y, max_x, max_y in bounding_boxes: 32 | if min_x <= x <= max_x and min_y <= y <= max_y: 33 | bounding_box_has_match[str((min_x, min_y, max_x, max_y))] = True 34 | return 1, bounding_box_has_match 35 | return 0, bounding_box_has_match 36 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/megabench/utils.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from mimetypes import guess_type 3 | 4 | 5 | def lazy_import(module_name, class_name): 6 | """Import the module lazily.""" 7 | 8 | def importer(): 9 | module = importlib.import_module(module_name) 10 | return getattr(module, class_name) 11 | 12 | return importer 13 | 14 | 15 | def is_video_file(file_path): 16 | mime_type, _ = guess_type(file_path) 17 | if not mime_type: 18 | return False 19 | return mime_type.startswith("video") 20 | 21 | 22 | 23 | def prepare_megabench_data(dataset_name, dataset_subset_name): 24 | """ 25 | Prepare the MEGA-Bench dataset for evaluation. 
26 | Return: 27 | subset_dataset: The organized data of the specified subset 28 | all_dataset: The organized data of all tasks, used for evaluation 29 | """ 30 | from datasets import load_dataset 31 | if "single_image" in dataset_subset_name: 32 | core_data = load_dataset(dataset_name, "core_single_image") 33 | open_data = load_dataset(dataset_name, "open_single_image") 34 | else: 35 | core_data = load_dataset(dataset_name, "core") 36 | open_data = load_dataset(dataset_name, "open") 37 | core_test_samples = list(core_data["test"]) 38 | organized_core_dataset = organize_hf_dataset(core_test_samples) 39 | open_test_samples = list(open_data["test"]) 40 | organized_open_dataset = organize_hf_dataset(open_test_samples) 41 | subset_dataset = organized_core_dataset if "core" in dataset_subset_name else organized_open_dataset 42 | all_dataset = organized_core_dataset + organized_open_dataset 43 | return subset_dataset, all_dataset 44 | 45 | 46 | def organize_hf_dataset(dataset): 47 | """ 48 | Organize the dataset in a task-based manner 49 | 50 | Return: 51 | organized_dataset: list, each item is a dict, with the following keys: 52 | - task_name: str 53 | - task_samples: list of dicts, each dict containing the sample information 54 | """ 55 | task_dict = {} 56 | for sample in dataset: 57 | task_name = sample["task_name"] 58 | if task_name not in task_dict: 59 | task_dict[task_name] = [] 60 | task_dict[task_name].append(sample) 61 | 62 | organized_dataset = [] 63 | for task_name, samples in task_dict.items(): 64 | organized_dataset.append({ 65 | "task_name": task_name, 66 | "task_samples": samples 67 | }) 68 | 69 | return organized_dataset 70 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/mmif/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/dataset/utils/mmif/__init__.py -------------------------------------------------------------------------------- /vlmeval/dataset/utils/mmsci4eval_req.txt: -------------------------------------------------------------------------------- 1 | evaluate 2 | pycocoevalcap 3 | bert_score 4 | rouge_score 5 | nltk 6 | absl-py 7 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/ocrbench.py: -------------------------------------------------------------------------------- 1 | from ...smp import * 2 | 3 | 4 | def OCRBench_eval(eval_file): 5 | OCRBench_score = { 6 | 'Regular Text Recognition': 0, 7 | 'Irregular Text Recognition': 0, 8 | 'Artistic Text Recognition': 0, 9 | 'Handwriting Recognition': 0, 10 | 'Digit String Recognition': 0, 11 | 'Non-Semantic Text Recognition': 0, 12 | 'Scene Text-centric VQA': 0, 13 | 'Doc-oriented VQA': 0, 14 | 'Key Information Extraction': 0, 15 | 'Handwritten Mathematical Expression Recognition': 0 16 | } 17 | 18 | logger = get_logger('Evaluation') 19 | 20 | data = load(eval_file) 21 | lt = len(data) 22 | lines = [data.iloc[i] for i in range(lt)] 23 | for i in tqdm(range(len(lines))): 24 | line = lines[i] 25 | predict = str(line['prediction']) 26 | answers = eval(line['answer']) 27 | category = line['category'] 28 | if category == 'Handwritten Mathematical Expression Recognition': 29 | for j in range(len(answers)): 30 | answer = answers[j].strip().replace('\n', ' ').replace(' ', '') 31 | predict = predict.strip().replace('\n', ' ').replace(' ', '') 32 | if answer in predict:
33 | OCRBench_score[category] += 1 34 | break 35 | else: 36 | for j in range(len(answers)): 37 | answer = answers[j].lower().strip().replace('\n', ' ') 38 | predict = predict.lower().strip().replace('\n', ' ') 39 | if answer in predict: 40 | OCRBench_score[category] += 1 41 | break 42 | 43 | final_score_dict = {} 44 | final_score_dict['Text Recognition'] = ( 45 | OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] 46 | + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition'] 47 | + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'] 48 | ) 49 | final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] 50 | final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] 51 | final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] 52 | final_score_dict['Handwritten Mathematical Expression Recognition'] = \ 53 | OCRBench_score['Handwritten Mathematical Expression Recognition'] 54 | final_score_dict['Final Score'] = ( 55 | final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] 56 | + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] 57 | + final_score_dict['Handwritten Mathematical Expression Recognition'] 58 | ) 59 | final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 60 | score_pth = eval_file.replace('.xlsx', '_score.json') 61 | dump(final_score_dict, score_pth) 62 | logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 63 | logger.info('Score: ') 64 | for key, value in final_score_dict.items(): 65 | logger.info('{}:{}'.format(key, value)) 66 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/qbench_video.py: -------------------------------------------------------------------------------- 1 | from ...smp import * 2 | from .multiple_choice import extract_answer_from_item 3 | from PIL import Image, ImageOps 4 | import numpy as np 5 | 6 | FAIL_MSG = 'Failed to obtain answer via API.' 7 | 8 | VQA_JUDGE_SYS_PROMPT = """ 9 | You are a helpful assistant that grades answers related to visual video quality. 10 | There are a lot of special terms or keywords related to video processing and photography. 11 | You will pay attention to the context of `quality evaluation' when grading. 12 | """ 13 | 14 | VQA_JUDGE_USER_PROMPT = """ 15 | Given the question {}, evaluate whether the response {} completely matches the correct answer {}. 16 | First, check the response and please rate score 0 if the response is not a valid answer. 17 | Please rate score 2 if the response completely or almost completely matches the correct answer on completeness, accuracy, and relevance. 18 | Please rate score 1 if the response partly matches the correct answer on completeness, accuracy, and relevance. 19 | Please rate score 0 if the response doesn't match the correct answer on completeness, accuracy, and relevance at all. 20 | Please only provide the result in the following format: Score:' 21 | """ # noqa: E501 22 | 23 | 24 | def check_ans_mcq(pred, ans, correct_choice, correct_answer): 25 | flag = False 26 | 27 | if correct_choice == pred or correct_choice + "." 
in pred or correct_answer == pred: 28 | flag = True 29 | elif correct_choice in pred.split("\n"): 30 | flag = True 31 | 32 | return flag 33 | 34 | 35 | def check_ans_vqa(model, line): 36 | score = model.generate(VQA_JUDGE_USER_PROMPT.format(line['question'], line['prediction'], line['answer'])).strip() 37 | return score 38 | 39 | 40 | def get_dimension_rating(score_file): 41 | score = load(score_file) 42 | result_dict = {} 43 | for idx, item in score.iterrows(): 44 | question_type = eval(item['dimensions'])[0].split(',')[0] 45 | if question_type not in result_dict: 46 | result_dict[question_type] = [0, 0] 47 | result_dict[question_type][0] += int(item['score']) 48 | result_dict[question_type][1] += 1 49 | return result_dict 50 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/dataset/utils/vgrpbench/__init__.py -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/coloredsudoku.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Union, Dict, Any 2 | import random 3 | import copy 4 | import os 5 | import json 6 | 7 | from .common_puzzle_factory import PuzzleFactory 8 | from .common_constriants import Constraint, ConstraintRowNoRepeat, ConstraintColNoRepeat, ConstraintSubGridNoRepeat 9 | 10 | class ConstraintColorNoRepeat(Constraint): 11 | def __init__(self) -> None: 12 | super().__init__() 13 | self.name = "constraint_color_no_repeat" 14 | 15 | def check(self, game_state: Dict[str, Any]) -> bool: 16 | board = game_state["board"] 17 | colors = game_state.get("colors", None) 18 | 19 | # If no colors are specified, skip this constraint 20 | if colors is None: 21 | return True 22 | 23 | color_groups = {} 24 | for i in range(len(board)): 25 | for j in range(len(board[0])): 26 | color = colors[i][j] 27 | if color not in color_groups: 28 | color_groups[color] = [] 29 | if board[i][j] != 0: 30 | color_groups[color].append(board[i][j]) 31 | for color_values in color_groups.values(): 32 | if len(set(color_values)) != len(color_values): 33 | return False 34 | return True 35 | 36 | class ColoredSudokuPuzzleFactory(PuzzleFactory): 37 | def __init__(self, size: int) -> None: 38 | super().__init__() 39 | self.game_name = "coloredsudoku" 40 | self.size = size 41 | self.constraints = [ 42 | ConstraintRowNoRepeat(), 43 | ConstraintColNoRepeat(), 44 | ConstraintColorNoRepeat() 45 | ] 46 | self.all_possible_values = [i for i in range(1, size + 1)] 47 | self.colors = [chr(65 + i) for i in range(size)] 48 | 49 | def get_possible_values(self, game_state: Dict[str, Any], row: int, col: int) -> List[int]: 50 | possible_values = [] 51 | board = game_state["board"] 52 | original_value = board[row][col] 53 | for value in self.all_possible_values: 54 | board[row][col] = value 55 | if self.check(game_state): 56 | possible_values.append(value) 57 | board[row][col] = original_value 58 | return possible_values 59 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/common_constriants.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | 3 | class Constraint(): 4 | def __init__(self) -> 
None: 5 | self.name = "" 6 | def check(self, game_state: Dict[str, Any]) -> bool: 7 | pass 8 | 9 | class ConstraintRowNoRepeat(Constraint): 10 | def __init__(self) -> None: 11 | super().__init__() 12 | self.name = "constraint_row_no_repeat" 13 | def check(self, game_state: Dict[str, Any]) -> bool: 14 | board = game_state["board"] 15 | for row in board: 16 | row_tmp = [cell for cell in row if cell != 0] 17 | if len(set(row_tmp)) != len(row_tmp): 18 | return False 19 | return True 20 | 21 | class ConstraintColNoRepeat(Constraint): 22 | def __init__(self) -> None: 23 | super().__init__() 24 | self.name = "constraint_col_no_repeat" 25 | def check(self, game_state: Dict[str, Any]) -> bool: 26 | board = game_state["board"] 27 | for col in range(len(board[0])): 28 | col_tmp = [board[row][col] for row in range(len(board)) if board[row][col] != 0] 29 | if len(set(col_tmp)) != len(col_tmp): 30 | return False 31 | return True 32 | 33 | class ConstraintSubGridNoRepeat(Constraint): 34 | def __init__(self) -> None: 35 | super().__init__() 36 | self.name = "constraint_sub_grid_no_repeat" 37 | def check(self, game_state: Dict[str, Any]) -> bool: 38 | board = game_state["board"] 39 | assert len(board) == len(board[0]), "board is not square" 40 | assert len(board) in [4, 9], "board size is not 4 or 9" 41 | 42 | sub_grid_size = int(len(board) ** 0.5) 43 | for i in range(0, len(board), sub_grid_size): 44 | for j in range(0, len(board[0]), sub_grid_size): 45 | sub_grid = [ 46 | board[x][y] for x in range(i, i + sub_grid_size) 47 | for y in range(j, j + sub_grid_size) 48 | if board[x][y] != 0 49 | ] 50 | if len(set(sub_grid)) != len(sub_grid): 51 | return False 52 | return True 53 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/common_get_game_factory.py: -------------------------------------------------------------------------------- 1 | def get_game_factory(game_type): 2 | if game_type == "sudoku": 3 | from .sudoku import SudokuPuzzleFactory as GameFactory 4 | elif game_type == "binairo": 5 | from .binairo import BinairoPuzzleFactory as GameFactory 6 | elif game_type == "coloredsudoku": 7 | from .coloredsudoku import ColoredSudokuPuzzleFactory as GameFactory 8 | elif game_type == "kakuro": 9 | from .kakuro import KakuroPuzzleFactory as GameFactory 10 | elif game_type == "killersudoku": 11 | from .killersudoku import KillerSudokuPuzzleFactory as GameFactory 12 | elif game_type == "renzoku": 13 | from .renzoku import RenzokuPuzzleFactory as GameFactory 14 | elif game_type == "skyscraper": 15 | from .skyscraper import SkyscraperPuzzleFactory as GameFactory 16 | elif game_type == "starbattle": 17 | from .starbattle import StarBattlePuzzleFactory as GameFactory 18 | elif game_type == "treesandtents": 19 | from .treesandtents import TreesAndTentsPuzzleFactory as GameFactory 20 | elif game_type == "thermometers": 21 | from .thermometers import ThermometersPuzzleFactory as GameFactory 22 | elif game_type == "futoshiki": 23 | from .futoshiki import FutoshikiPuzzleFactory as GameFactory 24 | elif game_type == "hitori": 25 | from .hitori import HitoriPuzzleFactory as GameFactory 26 | elif game_type == "aquarium": 27 | from .aquarium import AquariumPuzzleFactory as GameFactory 28 | elif game_type == "kakurasu": 29 | from .kakurasu import KakurasuPuzzleFactory as GameFactory 30 | elif game_type == "oddevensudoku": 31 | from .oddevensudoku import OddEvenSudokuPuzzleFactory as GameFactory 32 | elif game_type == "battleships": 33 | from .battleships 
import BattleshipsPuzzleFactory as GameFactory 34 | elif game_type == "fieldexplore": 35 | from .fieldexplore import FieldExplorePuzzleFactory as GameFactory 36 | elif game_type == "jigsawsudoku": 37 | from .jigsawsudoku import JigsawSudokuPuzzleFactory as GameFactory 38 | elif game_type == "lightup": 39 | from .lightup import LightUpPuzzleFactory as GameFactory 40 | elif game_type == "nonogram": 41 | from .nonogram import NonogramPuzzleFactory as GameFactory 42 | 43 | return GameFactory 44 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/common_get_prompt.py: -------------------------------------------------------------------------------- 1 | def get_prompt(game_type: str, thinking_format: str) -> str: 2 | if game_type == "sudoku": 3 | from puzzles.sudoku import SYSTEM_PROMPT 4 | elif game_type == "coloredsudoku": 5 | from puzzles.coloredsudoku import SYSTEM_PROMPT 6 | elif game_type == "binairo": 7 | from puzzles.binairo import SYSTEM_PROMPT 8 | elif game_type == "futoshiki": 9 | from puzzles.futoshiki import SYSTEM_PROMPT 10 | elif game_type == "hitori": 11 | from puzzles.hitori import SYSTEM_PROMPT 12 | elif game_type == "kakuro": 13 | from puzzles.kakuro import SYSTEM_PROMPT 14 | elif game_type == "killersudoku": 15 | from puzzles.killersudoku import SYSTEM_PROMPT 16 | elif game_type == "renzoku": 17 | from puzzles.renzoku import SYSTEM_PROMPT 18 | elif game_type == "skyscraper": 19 | from puzzles.skyscraper import SYSTEM_PROMPT 20 | elif game_type == "starbattle": 21 | from puzzles.starbattle import SYSTEM_PROMPT 22 | elif game_type == "sudoku": 23 | from puzzles.sudoku import SYSTEM_PROMPT 24 | elif game_type == "treesandtents": 25 | from puzzles.treesandtents import SYSTEM_PROMPT 26 | elif game_type == "thermometers": 27 | from puzzles.thermometers import SYSTEM_PROMPT 28 | elif game_type == "kakurasu": 29 | from puzzles.kakurasu import SYSTEM_PROMPT 30 | elif game_type == "aquarium": 31 | from puzzles.aquarium import SYSTEM_PROMPT 32 | elif game_type == "oddevensudoku": 33 | from puzzles.oddevensudoku import SYSTEM_PROMPT 34 | 35 | elif game_type == "battleships": 36 | from puzzles.battleships import SYSTEM_PROMPT 37 | elif game_type == "fieldexplore": 38 | from puzzles.fieldexplore import SYSTEM_PROMPT 39 | elif game_type == "jigsawsudoku": 40 | from puzzles.jigsawsudoku import SYSTEM_PROMPT 41 | elif game_type == "nonogram": 42 | from puzzles.nonogram import SYSTEM_PROMPT 43 | elif game_type == "lightup": 44 | from puzzles.lightup import SYSTEM_PROMPT 45 | 46 | else: 47 | raise ValueError(f"Unknown game type: {game_type}") 48 | 49 | if thinking_format == "direct_solution": 50 | return SYSTEM_PROMPT["direct_solution"] 51 | else: 52 | return SYSTEM_PROMPT["cot"] 53 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/fieldexplore.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Dict, Any, Tuple 3 | import random 4 | import copy 5 | import os 6 | import argparse 7 | 8 | from .common_puzzle_factory import PuzzleFactory 9 | from .common_constriants import Constraint 10 | 11 | class ConstraintAdjacentNumbers(Constraint): 12 | """Ensures revealed numbers match adjacent mine counts""" 13 | def check(self, game_state: List[List[Any]]) -> bool: 14 | 15 | board = game_state["board"] 16 | 17 | size = len(board) 18 | for i in range(size): 19 | for j in 
range(size): 20 | if isinstance(board[i][j], int) and board[i][j] != 0: # If cell is a revealed number 21 | # Count adjacent mines and undefined cells 22 | i_start = max(0, i-1) 23 | i_end = min(size, i+2) 24 | j_start = max(0, j-1) 25 | j_end = min(size, j+2) 26 | 27 | adjacent_mines = sum(1 for r in range(i_start, i_end) 28 | for c in range(j_start, j_end) 29 | if board[r][c] == 's') 30 | 31 | adjacent_undefined = sum(1 for r in range(i_start, i_end) 32 | for c in range(j_start, j_end) 33 | if board[r][c] == 0) 34 | 35 | # Check if current mines <= number <= potential mines (current + undefined) 36 | if adjacent_mines > board[i][j] or adjacent_mines + adjacent_undefined < board[i][j]: 37 | return False 38 | return True 39 | 40 | class FieldExplorePuzzleFactory(PuzzleFactory): 41 | def __init__(self, size: int) -> None: 42 | super().__init__() 43 | self.game_name = "fieldexplore" 44 | self.size = size 45 | self.constraints = [ConstraintAdjacentNumbers()] 46 | self.all_possible_values = ['s', 'e'] # True for 's', False for 'e' 47 | 48 | def check(self, board: List[List[Any]]) -> bool: 49 | for constraint in self.constraints: 50 | if not constraint.check(board): 51 | return False 52 | return True 53 | 54 | def get_possible_values(self, game_state: Dict[str, Any], row: int, col: int) -> List[int]: 55 | possible_values = [] 56 | board = game_state["board"] 57 | original_value = board[row][col] 58 | for value in self.all_possible_values: 59 | board[row][col] = value 60 | if self.check(game_state): 61 | possible_values.append(value) 62 | board[row][col] = original_value 63 | return possible_values 64 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/jigsawsudoku.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Union, Dict, Any 2 | import random 3 | import copy 4 | import os 5 | import json 6 | 7 | from .common_puzzle_factory import PuzzleFactory 8 | from .common_constriants import Constraint, ConstraintRowNoRepeat, ConstraintColNoRepeat, ConstraintSubGridNoRepeat 9 | 10 | class ConstraintRegionNoRepeat(Constraint): 11 | def __init__(self) -> None: 12 | super().__init__() 13 | self.name = "constraint_region_no_repeat" 14 | 15 | def check(self, game_state: Dict[str, Any]) -> bool: 16 | board = game_state["board"] 17 | regions = game_state.get("regions", None) 18 | 19 | if regions is None: 20 | return True 21 | 22 | region_groups = {} 23 | for i in range(len(board)): 24 | for j in range(len(board[0])): 25 | region = regions[i][j] 26 | if region not in region_groups: 27 | region_groups[region] = [] 28 | if board[i][j] != 0: 29 | region_groups[region].append(board[i][j]) 30 | for region_values in region_groups.values(): 31 | if len(set(region_values)) != len(region_values): 32 | return False 33 | return True 34 | 35 | class JigsawSudokuPuzzleFactory(PuzzleFactory): 36 | def __init__(self, size: int) -> None: 37 | super().__init__() 38 | self.game_name = "jigsawsudoku" 39 | self.size = size 40 | self.constraints = [ 41 | ConstraintRowNoRepeat(), 42 | ConstraintColNoRepeat(), 43 | ConstraintRegionNoRepeat() 44 | ] 45 | self.all_possible_values = [i for i in range(1, size + 1)] 46 | self.cached_region_splits = [] 47 | 48 | def get_possible_values(self, game_state: Dict[str, Any], row: int, col: int) -> List[int]: 49 | """Get possible values for a cell based on row, column, and region constraints.""" 50 | if game_state["board"][row][col] != 0: 51 | return [] 
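# A pre-filled cell has no candidate values; otherwise each value is tried in place and kept only if every constraint still holds.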
52 | possible_values = [] 53 | for value in self.all_possible_values: 54 | # Try the value 55 | original_value = game_state["board"][row][col] 56 | game_state["board"][row][col] = value 57 | # Check if it's valid according to all constraints 58 | valid = True 59 | for constraint in self.constraints: 60 | if not constraint.check(game_state): 61 | valid = False 62 | break 63 | 64 | # Restore original value 65 | game_state["board"][row][col] = original_value 66 | 67 | if valid: 68 | possible_values.append(value) 69 | 70 | return possible_values 71 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/killersudoku.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any, Tuple 2 | import random 3 | import copy 4 | import os 5 | import argparse 6 | 7 | from .common_puzzle_factory import PuzzleFactory 8 | from .common_constriants import Constraint, ConstraintRowNoRepeat, ConstraintColNoRepeat, ConstraintSubGridNoRepeat 9 | 10 | class ConstraintCageSum(Constraint): 11 | def __init__(self) -> None: 12 | super().__init__() 13 | self.name = "constraint_cage_sum" 14 | 15 | def check(self, game_state: Dict[str, Any]) -> bool: 16 | board = game_state["board"] 17 | cages = game_state.get("cages", []) # Default to empty list if no cages 18 | 19 | for cage in cages: 20 | cells = cage["cells"] 21 | target_sum = cage["sum"] 22 | current_sum = 0 23 | for row, col in cells: 24 | if board[row][col] == 0: # Skip empty cells 25 | continue 26 | current_sum += board[row][col] 27 | if current_sum > target_sum: # Can't exceed target sum 28 | return False 29 | # Only check equality if all cells in cage are filled 30 | if all(board[row][col] != 0 for row, col in cells) and current_sum != target_sum: 31 | return False 32 | return True 33 | 34 | class KillerSudokuPuzzleFactory(PuzzleFactory): 35 | def __init__(self, size: int) -> None: 36 | super().__init__() 37 | self.game_name = "killersudoku" 38 | self.size = size 39 | self.constraints = [ 40 | ConstraintRowNoRepeat(), 41 | ConstraintColNoRepeat(), 42 | ConstraintSubGridNoRepeat(), 43 | ConstraintCageSum() 44 | ] 45 | self.all_possible_values = [i for i in range(1, size + 1)] 46 | 47 | def get_possible_values(self, game_state: Dict[str, Any], row: int, col: int) -> List[int]: 48 | possible_values = [] 49 | board = game_state["board"] 50 | original_value = board[row][col] 51 | 52 | # Ensure cages exist in game_state 53 | if "cages" not in game_state: 54 | game_state["cages"] = [] 55 | 56 | for value in self.all_possible_values: 57 | board[row][col] = value 58 | if self.check(game_state): 59 | possible_values.append(value) 60 | board[row][col] = original_value 61 | return possible_values 62 | -------------------------------------------------------------------------------- /vlmeval/dataset/utils/vgrpbench/puzzles/sudoku.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Union, Dict, Any 2 | import random 3 | import copy 4 | from abc import ABC, abstractmethod 5 | import os 6 | import json 7 | import argparse 8 | 9 | from .common_puzzle_factory import PuzzleFactory 10 | from .common_constriants import Constraint, ConstraintRowNoRepeat, ConstraintColNoRepeat, ConstraintSubGridNoRepeat 11 | 12 | class SudokuPuzzleFactory(PuzzleFactory): 13 | def __init__(self, size: int) -> None: 14 | super().__init__() 15 | self.game_name = "sudoku" 16 | self.size = size 17 | 18 | 
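# Classic Sudoku rules: no value may repeat within a row, a column, or a sub-grid (the sub-grid constraint assumes a 4x4 or 9x9 board).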
self.constraints.append(ConstraintRowNoRepeat()) 19 | self.constraints.append(ConstraintColNoRepeat()) 20 | self.constraints.append(ConstraintSubGridNoRepeat()) 21 | 22 | self.all_possible_values = [i for i in range(1, size + 1)] 23 | def get_possible_values(self, game_state: Dict[str, Any], row: int, col: int) -> List[int]: 24 | possible_values = [] 25 | board = game_state["board"] 26 | original_value = board[row][col] 27 | for value in self.all_possible_values: 28 | board[row][col] = value 29 | if self.check(game_state): 30 | possible_values.append(value) 31 | board[row][col] = original_value 32 | return possible_values 33 | -------------------------------------------------------------------------------- /vlmeval/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .vlm import * 3 | from .misc import * 4 | from .log import * 5 | -------------------------------------------------------------------------------- /vlmeval/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.basicConfig( 3 | format='[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s', 4 | datefmt='%Y-%m-%d %H:%M:%S') 5 | 6 | logger_initialized = {} 7 | 8 | 9 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 10 | logger = logging.getLogger(name) 11 | if name in logger_initialized: 12 | return logger 13 | 14 | for logger_name in logger_initialized: 15 | if name.startswith(logger_name): 16 | return logger 17 | 18 | stream_handler = logging.StreamHandler() 19 | handlers = [stream_handler] 20 | 21 | try: 22 | import torch.distributed as dist 23 | if dist.is_available() and dist.is_initialized(): 24 | rank = dist.get_rank() 25 | else: 26 | rank = 0 27 | except ImportError: 28 | rank = 0 29 | 30 | if rank == 0 and log_file is not None: 31 | file_handler = logging.FileHandler(log_file, file_mode) 32 | handlers.append(file_handler) 33 | 34 | formatter = logging.Formatter( 35 | '[%(asctime)s] %(levelname)s - %(name)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s', 36 | datefmt='%Y-%m-%d %H:%M:%S') 37 | for handler in handlers: 38 | handler.setFormatter(formatter) 39 | handler.setLevel(log_level) 40 | logger.propagate = False 41 | logger.addHandler(handler) 42 | 43 | if rank == 0: 44 | logger.setLevel(log_level) 45 | else: 46 | logger.setLevel(logging.ERROR) 47 | 48 | logger_initialized[name] = True 49 | return logger 50 | -------------------------------------------------------------------------------- /vlmeval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .matching_util import can_infer, can_infer_option, can_infer_text, can_infer_sequence, can_infer_lego 2 | from .mp_util import track_progress_rich 3 | 4 | 5 | __all__ = [ 6 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 'can_infer_sequence', 'can_infer_lego', 7 | ] 8 | -------------------------------------------------------------------------------- /vlmeval/utils/mp_util.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import os 3 | from typing import Callable, Iterable, Sized 4 | 5 | from rich.progress import (BarColumn, MofNCompleteColumn, Progress, Task, 6 | TaskProgressColumn, TextColumn, TimeRemainingColumn) 7 | from rich.text import Text 8 | import os.path as osp 9 | import time 10 | import 
portalocker 11 | from ..smp import load, dump 12 | 13 | 14 | def track_progress_rich( 15 | func: Callable, 16 | tasks: Iterable = tuple(), 17 | nproc: int = 1, 18 | save=None, 19 | keys=None, 20 | **kwargs) -> list: 21 | 22 | from concurrent.futures import ThreadPoolExecutor 23 | from tqdm import tqdm 24 | if save is not None: 25 | assert osp.exists(osp.dirname(save)) or osp.dirname(save) == '' 26 | if not osp.exists(save): 27 | dump({}, save) 28 | if keys is not None: 29 | assert len(keys) == len(tasks) 30 | if not callable(func): 31 | raise TypeError('func must be a callable object') 32 | if not isinstance(tasks, Iterable): 33 | raise TypeError( 34 | f'tasks must be an iterable object, but got {type(tasks)}') 35 | assert nproc > 0, 'nproc must be a positive number' 36 | res = load(save) if save is not None else {} 37 | results = [None for _ in range(len(tasks))] 38 | 39 | with ThreadPoolExecutor(max_workers=nproc) as executor: 40 | futures = [] 41 | 42 | for inputs in tasks: 43 | if not isinstance(inputs, (tuple, list, dict)): 44 | inputs = (inputs, ) 45 | if isinstance(inputs, dict): 46 | future = executor.submit(func, **inputs) 47 | else: 48 | future = executor.submit(func, *inputs) 49 | futures.append(future) 50 | 51 | unfinished = set(range(len(tasks))) 52 | pbar = tqdm(total=len(unfinished)) 53 | while len(unfinished): 54 | new_finished = set() 55 | for idx in unfinished: 56 | if futures[idx].done(): 57 | results[idx] = futures[idx].result() 58 | new_finished.add(idx) 59 | if keys is not None: 60 | res[keys[idx]] = results[idx] 61 | if len(new_finished): 62 | if save is not None: 63 | dump(res, save) 64 | pbar.update(len(new_finished)) 65 | for k in new_finished: 66 | unfinished.remove(k) 67 | time.sleep(0.1) 68 | pbar.close() 69 | 70 | if save is not None: 71 | dump(res, save) 72 | return results 73 | -------------------------------------------------------------------------------- /vlmeval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.set_grad_enabled(False) 4 | torch.manual_seed(1234) 5 | from .aria import Aria 6 | from .base import BaseModel 7 | from .cogvlm import CogVlm, GLM4v 8 | from .emu import Emu, Emu3_chat, Emu3_gen 9 | from .eagle_x import Eagle 10 | from .idefics import IDEFICS, IDEFICS2 11 | from .instructblip import InstructBLIP 12 | from .kosmos import Kosmos2 13 | from .llava import ( 14 | LLaVA, 15 | LLaVA_Next, 16 | LLaVA_XTuner, 17 | LLaVA_Next2, 18 | LLaVA_OneVision, 19 | LLaVA_OneVision_HF, 20 | ) 21 | from .vita import VITA, VITAQwen2 22 | from .long_vita import LongVITA 23 | from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6, MiniCPM_o_2_6 24 | from .minigpt4 import MiniGPT4 25 | from .mmalaya import MMAlaya, MMAlaya2 26 | from .monkey import Monkey, MonkeyChat 27 | from .moondream import Moondream1, Moondream2 28 | from .minimonkey import MiniMonkey 29 | from .mplug_owl2 import mPLUG_Owl2 30 | from .omnilmm import OmniLMM12B 31 | from .open_flamingo import OpenFlamingo 32 | from .pandagpt import PandaGPT 33 | from .qwen_vl import QwenVL, QwenVLChat 34 | from .qwen2_vl import Qwen2VLChat, Qwen2VLChatAguvis 35 | from .transcore_m import TransCoreM 36 | from .visualglm import VisualGLM 37 | from .xcomposer import ( 38 | ShareCaptioner, 39 | XComposer, 40 | XComposer2, 41 | XComposer2_4KHD, 42 | XComposer2d5, 43 | ) 44 | from .yi_vl import Yi_VL 45 | from .internvl import InternVLChat 46 | from .deepseek_vl import DeepSeekVL 47 | from .deepseek_vl2 import 
DeepSeekVL2 48 | from .janus import Janus 49 | from .mgm import Mini_Gemini 50 | from .bunnyllama3 import BunnyLLama3 51 | from .vxverse import VXVERSE 52 | from .gemma import PaliGemma, Gemma3 53 | from .qh_360vl import QH_360VL 54 | from .phi3_vision import Phi3Vision, Phi3_5Vision 55 | from .phi4_multimodal import Phi4Multimodal 56 | from .wemm import WeMM 57 | from .cambrian import Cambrian 58 | from .chameleon import Chameleon 59 | from .video_llm import ( 60 | VideoLLaVA, 61 | VideoLLaVA_HF, 62 | Chatunivi, 63 | VideoChatGPT, 64 | LLaMAVID, 65 | VideoChat2_HD, 66 | PLLaVA, 67 | ) 68 | from .vila import VILA, NVILA 69 | from .ovis import Ovis, Ovis1_6, Ovis1_6_Plus, Ovis2 70 | from .mantis import Mantis 71 | from .mixsense import LLama3Mixsense 72 | from .parrot import Parrot 73 | from .omchat import OmChat 74 | from .rbdash import RBDash 75 | from .xgen_mm import XGenMM 76 | from .slime import SliME 77 | from .mplug_owl3 import mPLUG_Owl3 78 | from .pixtral import Pixtral 79 | from .llama_vision import llama_vision 80 | from .llama4 import llama4 81 | from .molmo import molmo 82 | from .points import POINTS, POINTSV15 83 | from .nvlm import NVLM 84 | from .vintern_chat import VinternChat 85 | from .h2ovl_mississippi import H2OVLChat 86 | from .falcon_vlm import Falcon2VLM 87 | from .smolvlm import SmolVLM, SmolVLM2 88 | from .sail_vl import SailVL 89 | from .valley import Valley2Chat 90 | from .ross import Ross 91 | from .ola import Ola 92 | from .ursa import UrsaChat 93 | from .vlm_r1 import VLMR1Chat 94 | from .aki import AKI 95 | from .ristretto import Ristretto 96 | from .vlaa_thinker import VLAAThinkerChat 97 | from .kimi_vl import KimiVL 98 | from .wethink_vl import WeThinkVL 99 | from .flash_vl import FlashVL 100 | -------------------------------------------------------------------------------- /vlmeval/vlm/chameleon.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import warnings 3 | from .base import BaseModel 4 | from ..smp import * 5 | from PIL import Image 6 | import torch 7 | 8 | 9 | class Chameleon(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = True 13 | 14 | def __init__(self, model_path='facebook/chameleon-7b', **kwargs): 15 | try: 16 | from transformers import ChameleonProcessor, ChameleonForConditionalGeneration 17 | except Exception as e: 18 | logging.critical('Please install the latest transformers.') 19 | raise e 20 | 21 | processor = ChameleonProcessor.from_pretrained(model_path) 22 | model = ChameleonForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16) 23 | 24 | self.model = model.cuda().eval() 25 | self.processor = processor 26 | 27 | def generate_inner(self, message, dataset=None): 28 | content, images = '', [] 29 | for x in message: 30 | if x['type'] == 'text': 31 | content += x['value'] 32 | elif x['type'] == 'image': 33 | content += '\n' 34 | images.append(Image.open(x['value'])) 35 | 36 | inputs = self.processor( 37 | text=[content], 38 | images=images, 39 | padding=True, 40 | return_tensors='pt' 41 | ).to(device='cuda', dtype=torch.bfloat16) 42 | generate_ids = self.model.generate(**inputs, max_new_tokens=2048) 43 | input_token_len = inputs.input_ids.shape[1] 44 | text = self.processor.batch_decode( 45 | generate_ids[:, input_token_len:], 46 | skip_special_tokens=True, 47 | clean_up_tokenization_spaces=False 48 | )[0] 49 | return text 50 | -------------------------------------------------------------------------------- /vlmeval/vlm/falcon_vlm.py: 
-------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import requests 3 | 4 | from .base import BaseModel 5 | 6 | 7 | class Falcon2VLM(BaseModel): 8 | 9 | INSTALL_REQ = False 10 | INTERLEAVE = False 11 | 12 | def __init__(self, model_path='tiiuae/falcon-11B-vlm', **kwargs): 13 | import torch 14 | from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor 15 | 16 | self.model_path = model_path 17 | self.processor = LlavaNextProcessor.from_pretrained(model_path, tokenizer_class='PreTrainedTokenizerFast') 18 | self.model = LlavaNextForConditionalGeneration.from_pretrained( 19 | model_path, torch_dtype=torch.bfloat16, device_map='cuda').eval() 20 | default_kwargs = {'max_new_tokens': 512} 21 | default_kwargs.update(kwargs) 22 | self.kwargs = default_kwargs 23 | 24 | def generate_inner(self, message, dataset=None): 25 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 26 | image = Image.open(image_path).convert('RGB') 27 | 28 | prompt = f'User:\n{prompt} Falcon:' 29 | inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda') 30 | 31 | output = self.model.generate(**inputs, **self.kwargs) 32 | prompt_length = inputs['input_ids'].shape[1] 33 | model_response = self.processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip() 34 | return model_response 35 | -------------------------------------------------------------------------------- /vlmeval/vlm/instructblip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import os.path as osp 4 | import sys 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class InstructBLIP(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, name): 15 | self.config_map = { 16 | 'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml', 17 | 'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml', 18 | } 19 | 20 | self.file_path = __file__ 21 | config_root = osp.dirname(self.file_path) 22 | 23 | try: 24 | from lavis.models import load_preprocess 25 | from omegaconf import OmegaConf 26 | from lavis.common.registry import registry 27 | except Exception as e: 28 | logging.critical('Please install lavis before using InstructBLIP. 
') 29 | raise e 30 | 31 | assert name in self.config_map 32 | cfg_path = osp.join(config_root, self.config_map[name]) 33 | cfg = OmegaConf.load(cfg_path) 34 | 35 | model_cfg = cfg.model 36 | assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2 37 | model_cls = registry.get_model_class(name='blip2_vicuna_instruct') 38 | model = model_cls.from_config(model_cfg) 39 | model.eval() 40 | 41 | self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu' 42 | device = self.device 43 | model.to(device) 44 | self.model = model 45 | self.kwargs = {'max_length': 512} 46 | 47 | preprocess_cfg = cfg.preprocess 48 | vis_processors, _ = load_preprocess(preprocess_cfg) 49 | self.vis_processors = vis_processors 50 | 51 | def generate_inner(self, message, dataset=None): 52 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 53 | vis_processors = self.vis_processors 54 | raw_image = Image.open(image_path).convert('RGB') 55 | image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device) 56 | outputs = self.model.generate(dict(image=image_tensor, prompt=prompt)) 57 | return outputs[0] 58 | -------------------------------------------------------------------------------- /vlmeval/vlm/internvl/__init__.py: -------------------------------------------------------------------------------- 1 | from .internvl_chat import InternVLChat 2 | 3 | __all__ = ['InternVLChat'] 4 | -------------------------------------------------------------------------------- /vlmeval/vlm/internvl/gui_template.yaml: -------------------------------------------------------------------------------- 1 | ScreenSpot: 2 | template_zeroshot: |- 3 | Based on the screenshot of the page, I give a text description and you give the bounding box coordinate of the region this sentence describes: {task} 4 | template: |- 5 | {task} 6 | placeholders: 7 | - task 8 | 9 | ScreenSpot_Pro: 10 | template_zeroshot: |- 11 | Based on the screenshot of the page, I give a text description and you give the bounding box coordinate of the region this sentence describes: {task} 12 | template: |- 13 | {task} 14 | placeholders: 15 | - task 16 | 17 | ScreenSpot_v2: 18 | template_zeroshot: |- 19 | Based on the screenshot of the page, I give a text description and you give the bounding box coordinate of the region this sentence describes: {task} 20 | template: |- 21 | {task} 22 | placeholders: 23 | - task 24 | 25 | MM_Mind2Web: 26 | system_prompt: |- 27 | You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. 28 | 29 | You have access to the following functions: 30 | - {"name": "mobile.swipe", "description": "swipe on the screen", "parameters": {"type": "object", "properties": {"from_coord": {"type": "array", "items": {"type": "number"}, "description": "The starting coordinates of the swipe"}, "to_coord": {"type": "array", "items": {"type": "number"}, "description": "The ending coordinates of the swipe"}}, "required": ["from_coord", "to_coord"]}} 31 | - {"name": "mobile.home", "description": "Press the home button"} 32 | - {"name": "mobile.back", "description": "Press the back button"} 33 | 34 | template: |- 35 | Please generate the next move according to the ui screenshot, instruction and previous actions. 36 | 37 | Instruction: 38 | {task}. 39 | 40 | Previous actions: 41 | {history}. 
42 | 43 | placeholders: 44 | - task 45 | - history 46 | -------------------------------------------------------------------------------- /vlmeval/vlm/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF 2 | from .llava_xtuner import LLaVA_XTuner 3 | 4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF'] 5 | -------------------------------------------------------------------------------- /vlmeval/vlm/minigpt4.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os.path as osp 4 | import warnings 5 | from transformers import StoppingCriteriaList 6 | from .base import BaseModel 7 | 8 | 9 | class MiniGPT4(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, 15 | mode='v2', 16 | root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/', 17 | temperature=1, 18 | max_out_len=512): 19 | 20 | if root is None: 21 | warnings.warn( 22 | 'Please set root to the directory of MiniGPT-4, which is cloned from here: ' 23 | 'https://github.com/Vision-CAIR/MiniGPT-4. ' 24 | ) 25 | 26 | if mode == 'v2': 27 | cfg = 'minigptv2_eval.yaml' 28 | elif mode == 'v1_7b': 29 | cfg = 'minigpt4_7b_eval.yaml' 30 | elif mode == 'v1_13b': 31 | cfg = 'minigpt4_13b_eval.yaml' 32 | else: 33 | raise NotImplementedError 34 | 35 | self.mode = mode 36 | self.temperature = temperature 37 | self.max_out_len = max_out_len 38 | self.root = root 39 | this_dir = osp.dirname(__file__) 40 | 41 | self.cfg = osp.join(this_dir, 'misc', cfg) 42 | sys.path.append(self.root) 43 | 44 | from omegaconf import OmegaConf 45 | from minigpt4.common.registry import registry 46 | from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2 47 | 48 | device = torch.cuda.current_device() 49 | self.device = device 50 | 51 | cfg_path = self.cfg 52 | cfg = OmegaConf.load(cfg_path) 53 | 54 | model_cfg = cfg.model 55 | model_cfg.device_8bit = device 56 | model_cls = registry.get_model_class(model_cfg.arch) 57 | model = model_cls.from_config(model_cfg) 58 | model = model.to(device) 59 | model.eval() 60 | vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train 61 | vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) 62 | self.model = model 63 | self.vis_processor = vis_processor 64 | 65 | self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0 66 | stop_words_ids = [[835], [2277, 29937]] 67 | stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids] 68 | self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) 69 | 70 | def generate_inner(self, message, dataset=None): 71 | from minigpt4.conversation.conversation import Chat 72 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 73 | if self.mode == 'v2': 74 | chat = Chat(self.model, self.vis_processor, device=self.device) 75 | else: 76 | chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria) 77 | 78 | chat_state = self.CONV_VISION.copy() 79 | img_list = [] 80 | _ = chat.upload_img(image_path, chat_state, img_list) 81 | chat.encode_img(img_list) 82 | chat.ask(prompt, chat_state) 83 | with torch.inference_mode(): 84 | msg = chat.answer(conv=chat_state, 
img_list=img_list)[0] 85 | return msg 86 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigpt4_13b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-13b-v0" 25 | 26 | datasets: 27 | cc_sbu_align: 28 | vis_processor: 29 | train: 30 | 
name: "blip2_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | 36 | run: 37 | task: image_text_pretrain 38 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigpt4_7b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-7b-v0" 25 | 26 | 27 | datasets: 28 | cc_sbu_align: 29 | vis_processor: 30 | train: 31 | name: "blip2_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | 37 | run: 38 | task: image_text_pretrain 39 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigptv2_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt_v2 3 | model_type: pretrain 4 | max_txt_len: 160 5 | end_sym: "" 6 | low_resource: True 7 | prompt_template: '[INST] {} [/INST]' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | lora_r: 64 10 | lora_alpha: 16 11 | 12 | # vit encoder 13 | image_size: 448 14 | drop_path_rate: 0 15 | use_grad_checkpoint: False 16 | vit_precision: "fp16" 17 | freeze_vit: True 18 | 19 | # generation configs 20 | prompt: "" 21 | 22 | # LLM 23 | llama_model: "please set this value to the path of llama2-chat-7b" 24 | 25 | datasets: 26 | cc_sbu_align: 27 | vis_processor: 28 | train: 29 | name: "blip2_image_eval" 30 | image_size: 448 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | 35 | run: 36 | task: image_text_pretrain 37 | -------------------------------------------------------------------------------- /vlmeval/vlm/mixsense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from PIL import Image 5 | import warnings 6 | 7 | from .base import BaseModel 8 | from ..smp import * 9 | 10 | 11 | class LLama3Mixsense(BaseModel): 12 | 13 | INSTALL_REQ = False 14 | INTERLEAVE = False 15 | 16 | def __init__(self, model_path='Zero-Vision/Llama-3-MixSenseV1_1', **kwargs): 17 | assert model_path is not None 18 | transformers.logging.set_verbosity_error() 19 | transformers.logging.disable_progress_bar() 20 | warnings.filterwarnings('ignore') 21 | self.tokenizer = AutoTokenizer.from_pretrained( 22 | model_path, trust_remote_code=True 23 | ) 24 | self.model = AutoModelForCausalLM.from_pretrained( 25 | model_path, trust_remote_code=True, device_map='auto' 26 | ).eval() 27 | self.kwargs = kwargs 28 | 29 | def generate_inner(self, message, dataset=None): 30 | prompt, image_path = self.message_to_promptimg(message) 31 | input_ids = self.model.text_process(prompt, self.tokenizer).to(device='cuda') 32 | image = Image.open(image_path).convert('RGB') 33 | image_tensor = self.model.image_process([image]).to(dtype=self.model.dtype, device='cuda') 34 | # generate 35 | 
with torch.inference_mode(): 36 | output_ids = self.model.generate( 37 | input_ids, 38 | images=image_tensor, 39 | max_new_tokens=2048, 40 | use_cache=True, 41 | eos_token_id=[ 42 | self.tokenizer.eos_token_id, 43 | self.tokenizer.convert_tokens_to_ids(['<|eot_id|>'])[0], 44 | ], 45 | ) 46 | return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 47 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/__init__.py: -------------------------------------------------------------------------------- 1 | from .ola_model import Ola 2 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/arguments.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | from dataclasses import dataclass, field 4 | from typing import Optional 5 | 6 | 7 | @dataclass 8 | class ModelArguments: 9 | model_name_or_path: Optional[str] = field(default="facebook/opt-125m") 10 | version: Optional[str] = field(default="v0") 11 | freeze_backbone: bool = field(default=False) 12 | tune_speech_projector: bool = field(default=False) 13 | tune_speech_encoder: bool = field(default=False) 14 | tune_speech_generator_only: bool = field(default=False) 15 | speech_encoder_type: Optional[str] = field(default=None) 16 | speech_encoder: Optional[str] = field(default=None) 17 | pretrain_speech_projector: Optional[str] = field(default=None) 18 | speech_projector_type: Optional[str] = field(default='linear') 19 | speech_encoder_ds_rate: int = 5 20 | speech_encoder_hidden_size: int = 1280 21 | 22 | 23 | @dataclass 24 | class DataArguments: 25 | data_path: str = field(default=None, 26 | metadata={"help": "Path to the training data."}) 27 | is_multimodal: bool = False 28 | input_type: str = field(default="mel") 29 | speech_normalize: bool = False 30 | mel_size: int = 128 31 | has_tgt_units: bool = False 32 | 33 | 34 | @dataclass 35 | class TrainingArguments(transformers.TrainingArguments): 36 | cache_dir: Optional[str] = field(default=None) 37 | optim: str = field(default="adamw_torch") 38 | freeze_speech_projector: bool = field(default=False) 39 | model_max_length: int = field( 40 | default=512, 41 | metadata={ 42 | "help": 43 | "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 44 | }, 45 | ) 46 | double_quant: bool = field( 47 | default=True, 48 | metadata={"help": "Compress the quantization statistics through double quantization."} 49 | ) 50 | quant_type: str = field( 51 | default="nf4", 52 | metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."} 53 | ) 54 | bits: int = field( 55 | default=16, 56 | metadata={"help": "How many bits to use."} 57 | ) 58 | lora_enable: bool = False 59 | lora_r: int = 64 60 | lora_alpha: int = 16 61 | lora_dropout: float = 0.05 62 | lora_weight_path: str = "" 63 | lora_bias: str = "none" 64 | speech_projector_lr: Optional[float] = None 65 | group_by_modality_length: bool = field(default=False) 66 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
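# Directory where log files are written; '.' keeps them in the current working directory.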
5 | 6 | # Model Constants (placeholder token strings below assume the standard LLaVA-style convention) 7 | IGNORE_INDEX = -100 8 | SPEECH_TOKEN_INDEX = -200 9 | DEFAULT_SPEECH_TOKEN = "<speech>" 10 | IMAGE_TOKEN_INDEX = -300 11 | DEFAULT_IMAGE_TOKEN = "<image>" 12 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 13 | DEFAULT_IM_START_TOKEN = "<im_start>" 14 | DEFAULT_IM_END_TOKEN = "<im_end>" 15 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/vlm/ola/ola/datasets/__init__.py -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.ola_qwen import OlaQwenForCausalLM, OlaConfigQwen 2 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .oryx_vit import SigLIPViTAnysizeWrapper 3 | 4 | def build_vision_tower(vision_tower_cfg, **kwargs): 5 | vision_tower = getattr(vision_tower_cfg, 'vision_tower', getattr(vision_tower_cfg, 'mm_vision_tower', None)) 6 | is_absolute_path_exists = os.path.exists(vision_tower) 7 | print(f"Building OryxViTWrapper from {vision_tower}...") 8 | # path = vision_tower.split(":")[1] 9 | return SigLIPViTAnysizeWrapper(vision_tower, path=vision_tower, args=vision_tower_cfg, **kwargs) 10 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/multimodal_projector/pooler_projector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | 6 | from transformers.models.clip.modeling_clip import CLIPVisionModel 7 | import os 8 | 9 | 10 | class PoolerProjector(nn.Module): 11 | def __init__(self, config, vision_cfg): 12 | super().__init__() 13 | self._config = config 14 | self.hw = vision_cfg.image_size // vision_cfg.patch_size 15 | 16 | self.conv_pool = nn.Conv2d( 17 | config.mm_hidden_size, config.hidden_size, 18 | kernel_size=2, stride=2 19 | ) 20 | 21 | self.proj = nn.Sequential( 22 | nn.GELU(), 23 | nn.Linear(config.hidden_size, config.hidden_size), 24 | ) 25 | 26 | def forward(self, x, *args, **kwargs): 27 | height = width = self.hw 28 | assert height * width == x.shape[1] 29 | x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2) 30 | x = self.conv_pool(x) 31 | x = x.flatten(2).transpose(1, 2) 32 | x = self.proj(x) 33 | return x 34 | 35 | @property 36 | def config(self): 37 | return {"mm_projector_type": 'pooler'} 38 | 39 | 40 | class NormalizedDwPooler(nn.Module): 41 | def __init__(self, dim): 42 | super().__init__() 43 | self.dim = dim 44 | self.predictor = nn.Sequential( 45 | nn.Linear(dim*2, dim), 46 | nn.GELU(), 47 | nn.Linear(dim, dim), 48 | ) 49 | 50 | def forward(self, x, forward_type='2x'): 51 | B, H, W, C = x.shape 52 | 53 | if forward_type == '2x': 54 | new_x = x.reshape(B, H//2, 2, W//2, 2, C).permute(0, 1, 3, 2, 4, 5).reshape(B, H//2, W//2, 4, C) 55 | pooled_x = new_x.mean(-2, keepdim=True).expand(-1, -1, -1, 4, -1) 56 | fused_x = torch.cat([new_x, pooled_x], dim=-1) 57 | elif forward_type == '1x': 58 | new_x = x.reshape(B, H, W, 1, C) 59 | fused_x = torch.cat([new_x, new_x], dim=-1) 60
| elif forward_type == '4x': 61 | new_x = x.reshape(B, H//4, 4, W//4, 4, C).permute(0, 1, 3, 2, 4, 5).reshape(B, H//4, W//4, 16, C) 62 | pooled_x = new_x.mean(-2, keepdim=True).expand(-1, -1, -1, 16, -1) 63 | fused_x = torch.cat([new_x, pooled_x], dim=-1) 64 | 65 | score = self.predictor(fused_x) 66 | normalized_score = F.softmax(score, dim=-2) 67 | new_x = (new_x * normalized_score).sum(dim=-2) 68 | return new_x 69 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/multimodal_resampler/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .perceiver import DynamicCompressor 4 | 5 | class IdentityMap(torch.nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def forward(self, x, *args, **kwargs): 10 | return x 11 | 12 | @property 13 | def config(self): 14 | return {"mm_resampler_type": None} 15 | 16 | def build_vision_resampler(model_args, delay_load=False, **kwargs): 17 | # import pdb;pdb.set_trace() 18 | resampler_type = getattr(model_args, 'mm_resampler_type', None) 19 | if resampler_type == 'dynamic_compressor': 20 | return DynamicCompressor(model_args, **kwargs) 21 | elif resampler_type is None: 22 | return IdentityMap() 23 | else: 24 | raise ValueError(f'Unknown resampler type: {resampler_type}') 25 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/multimodal_resampler/perceiver.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import torch.nn.functional as F 5 | import os 6 | 7 | class DynamicCompressor(nn.Module): 8 | def __init__(self, model_args, vision_tower): 9 | super().__init__() 10 | 11 | self.out_channels = vision_tower.hidden_size 12 | self.mid_channel = 256 13 | 14 | self.vlm_query_projector = nn.Linear(self.out_channels, self.mid_channel) 15 | self.vlm_key_projector = nn.Linear(self.out_channels, self.mid_channel) 16 | 17 | def downsample(self, x): 18 | return F.avg_pool2d(x, 2, 2) 19 | 20 | def downsample_4(self, x): 21 | return F.avg_pool2d(x, 4, 4) 22 | 23 | def forward(self, image_features, forward_type, image_size=None): 24 | if image_size is None: 25 | ori_W = int(math.sqrt(image_features.shape[1])) 26 | ori_H = int(ori_W) 27 | else: 28 | ori_H, ori_W = image_size 29 | T, N, C = image_features.shape 30 | image_features = image_features.view(T, ori_H, ori_W, C).permute(0, 3, 1, 2) # T, C, H, W 31 | 32 | if forward_type == 'video': 33 | image_features_pool = self.downsample(image_features) 34 | image_feature_attn = image_features.reshape(T, C, ori_H // 2, 2, ori_W // 2, 2).permute(0, 2, 4, 3, 5, 1).reshape(T, ori_H // 2 * ori_W // 2, 4, C) 35 | new_image_size = (ori_H // 2, ori_W // 2) 36 | elif forward_type == 'image' or forward_type == 'text': 37 | image_features_pool = image_features 38 | image_feature_attn = image_features.reshape(T, C, ori_H, 1, ori_W, 1).permute(0, 2, 4, 3, 5, 1).reshape(T, ori_H * ori_W, 1, C) 39 | new_image_size = (ori_H, ori_W) 40 | elif forward_type == 'video_long': 41 | image_features_pool = self.downsample_4(image_features) 42 | image_feature_attn = image_features.reshape(T, C, ori_H // 4, 4, ori_W // 4, 4).permute(0, 2, 4, 3, 5, 1).reshape(T, ori_H // 4 * ori_W // 4, 16, C) 43 | new_image_size = (ori_H // 4, ori_W // 4) 44 | else: 45 | raise NotImplementedError 46 | 47 | image_features_pool = image_features_pool.flatten(2).permute(0, 2, 1) # 
T, H*W, C 48 | new_t, new_p, _ = image_features_pool.shape 49 | 50 | image_query = self.vlm_query_projector(image_features_pool).reshape(new_t*new_p, self.mid_channel) 51 | image_key = self.vlm_key_projector(image_feature_attn).reshape(new_t*new_p, -1, self.mid_channel) 52 | 53 | image_value = image_feature_attn.reshape(new_t*new_p, -1, self.out_channels) 54 | # import pdb;pdb.set_trace() 55 | 56 | image_attn = image_query[:,None] @ (image_key.transpose(-1,-2) / (image_key.shape[-1]**0.5)) 57 | image_attn = image_attn.nan_to_num() 58 | attn_feat = (image_attn.softmax(-1) @ image_value).mean(1).reshape(new_t, new_p, C) 59 | 60 | image_features_pool = image_features_pool + attn_feat 61 | 62 | return image_features_pool, new_image_size 63 | 64 | @property 65 | def config(self): 66 | return { 67 | 'mm_resampler_type': 'dynamic_compressor', 68 | 'mm_out_channels': self.out_channels, 69 | } 70 | 71 | @property 72 | def hidden_size(self): 73 | return self.out_channels 74 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_encoder/beats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/vlm/ola/ola/model/speech_encoder/beats/__init__.py -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_encoder/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_encoder import WhisperWrappedEncoder, DualWrappedEncoder 2 | 3 | 4 | def build_speech_encoder(config): 5 | speech_encoder_type = getattr(config, 'speech_encoder_type', None) 6 | if "whisper" in speech_encoder_type.lower(): 7 | return WhisperWrappedEncoder.load(config) 8 | elif "dual" in speech_encoder_type.lower(): 9 | return DualWrappedEncoder(config) 10 | 11 | raise ValueError(f'Unknown speech encoder: {speech_encoder_type}') 12 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_encoder/speech_encoder.py: -------------------------------------------------------------------------------- 1 | import types 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from transformers import WhisperFeatureExtractor 6 | import whisper 7 | 8 | from ....ola.model.speech_encoder.beats.BEATs import BEATsConfig, BEATs 9 | 10 | class WhisperWrappedEncoder: 11 | 12 | @classmethod 13 | def load(cls, model_config): 14 | 15 | def replace_layer_norm(module): 16 | from whisper.model import LayerNorm 17 | for name, child in module.named_children(): 18 | if isinstance(child, LayerNorm): 19 | old_params = child.state_dict() 20 | new_layer_norm = nn.LayerNorm(child.normalized_shape, eps=child.eps, elementwise_affine=child.elementwise_affine) 21 | new_layer_norm.load_state_dict(old_params) 22 | setattr(module, name, new_layer_norm) 23 | else: 24 | replace_layer_norm(child) 25 | 26 | encoder = whisper.load_model(name=model_config.speech_encoder, device='cpu').encoder 27 | replace_layer_norm(encoder) 28 | return encoder 29 | 30 | class DualWrappedEncoder(nn.Module): 31 | def __init__(self, config): 32 | super().__init__() 33 | self.config = config 34 | self.whisper_model = self.load_whisper(config) 35 | self.beats_model = self.load_beats(config) 36 | 37 | def load_whisper(cls, model_config): 38 | 39 | def replace_layer_norm(module): 40 | from whisper.model import 
LayerNorm 41 | for name, child in module.named_children(): 42 | if isinstance(child, LayerNorm): 43 | old_params = child.state_dict() 44 | new_layer_norm = nn.LayerNorm(child.normalized_shape, eps=child.eps, elementwise_affine=child.elementwise_affine) 45 | new_layer_norm.load_state_dict(old_params) 46 | setattr(module, name, new_layer_norm) 47 | else: 48 | replace_layer_norm(child) 49 | 50 | encoder = whisper.load_model(name=model_config.speech_encoder, device='cpu').encoder 51 | replace_layer_norm(encoder) 52 | return encoder 53 | 54 | def load_beats(cls, model_config): 55 | beats_path = model_config.music_encoder 56 | print("Loading BEATs Model") 57 | beats_ckpt = torch.load(beats_path, map_location='cpu') 58 | beats_cfg = BEATsConfig(beats_ckpt['cfg']) 59 | beats = BEATs(beats_cfg) 60 | beats.load_state_dict(beats_ckpt['model']) 61 | return beats 62 | 63 | def forward(self, x, raw_wav=None, audio_padding_mask=None): 64 | with torch.no_grad(): 65 | self.beats_model = self.beats_model.float() 66 | speech_embeds = self.whisper_model(x.half()) 67 | audio_embeds, _ = self.beats_model.extract_features(raw_wav.float(), padding_mask=audio_padding_mask, feature_only=True) 68 | if audio_embeds.size(1) < speech_embeds.size(1): 69 | audio_embeds = F.pad(audio_embeds, (0, 0, 0, speech_embeds.size(1) - audio_embeds.size(1))) 70 | elif audio_embeds.size(1) > speech_embeds.size(1): 71 | speech_embeds = F.pad(speech_embeds, (0, 0, 0, audio_embeds.size(1) - speech_embeds.size(1))) 72 | speech_embeds = torch.cat((speech_embeds, audio_embeds), dim=-1) 73 | speech_embeds = speech_embeds.to(torch.bfloat16) 74 | return speech_embeds 75 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_projector/builder.py: -------------------------------------------------------------------------------- 1 | from .speech_projector import EncoderProjectorConcat 2 | 3 | 4 | def build_speech_projector(config): 5 | projector_type = getattr(config, 'speech_projector_type', 'linear') 6 | if projector_type == 'linear': 7 | return EncoderProjectorConcat(config) 8 | 9 | raise ValueError(f'Unknown projector type: {projector_type}') 10 | -------------------------------------------------------------------------------- /vlmeval/vlm/ola/ola/model/speech_projector/speech_projector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | class EncoderProjectorConcat(nn.Module): 6 | def __init__(self, config): 7 | super().__init__() 8 | self.k = config.speech_encoder_ds_rate 9 | self.encoder_dim = config.speech_encoder_hidden_size 10 | self.llm_dim = config.hidden_size 11 | self.linear1 = nn.Linear(self.encoder_dim * self.k, 2048) 12 | self.relu = nn.ReLU() 13 | self.linear2 = nn.Linear(2048, config.hidden_size) 14 | 15 | embed_std = 1 / math.sqrt(config.hidden_size) 16 | self.speech_newline = nn.Parameter( 17 | torch.randn(config.hidden_size) * embed_std 18 | ) 19 | self.speech_begin = nn.Parameter( 20 | torch.randn(config.hidden_size) * embed_std 21 | ) 22 | self.speech_end = nn.Parameter( 23 | torch.randn(config.hidden_size) * embed_std 24 | ) 25 | 26 | def forward(self, x): 27 | batch_size, seq_len, dim = x.size() 28 | num_frames_to_discard = seq_len % self.k 29 | if num_frames_to_discard > 0: 30 | x = x[:, :-num_frames_to_discard, :] 31 | seq_len = x.size(1) 32 | 33 | x = x.contiguous() 34 | x = x.view(batch_size, seq_len // self.k, dim * self.k) 35 | x = self.linear1(x) 36 | x = 
self.relu(x) 37 | x = self.linear2(x) 38 | x = torch.cat([ 39 | x, 40 | self.speech_newline.reshape(1, 1, -1).expand(batch_size, 1, -1).to(x.dtype) 41 | ], dim=1) 42 | begin = self.speech_begin.reshape(1, -1).to(x.dtype) 43 | end = self.speech_end.reshape(1, -1).to(x.dtype) 44 | x = x.flatten(0, 1) 45 | x = torch.cat([begin, x, end], dim=0) 46 | # x = x.flatten(0, 1) 47 | return x 48 | -------------------------------------------------------------------------------- /vlmeval/vlm/ovis/__init__.py: -------------------------------------------------------------------------------- 1 | from .ovis import Ovis, Ovis1_6, Ovis1_6_Plus, Ovis2 2 | 3 | __all__ = ['Ovis', 'Ovis1_6', 'Ovis1_6_Plus', 'Ovis2'] 4 | -------------------------------------------------------------------------------- /vlmeval/vlm/ovis/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-compass/VLMEvalKit/cb1b907680808ffb3c314b8a34e0c90a7a3db1de/vlmeval/vlm/ovis/utils/__init__.py -------------------------------------------------------------------------------- /vlmeval/vlm/pandagpt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import os.path as osp 4 | import warnings 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class PandaGPT(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, name, root=None, **kwargs): 15 | if root is None: 16 | raise ValueError('Please set `root` to PandaGPT code directory, which is cloned from here: ') 17 | 18 | assert name == 'PandaGPT_13B' 19 | self.name = name 20 | sys.path.append(osp.join(root, 'code')) 21 | try: 22 | from model.openllama import OpenLLAMAPEFTModel 23 | except Exception as e: 24 | logging.critical( 25 | 'Please first install PandaGPT and set the root path to use PandaGPT, ' 26 | 'which is cloned from here: https://github.com/yxuansu/PandaGPT. ' 27 | ) 28 | raise e 29 | 30 | self.args = { 31 | 'model': 'openllama_peft', 32 | 'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'), 33 | 'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'), 34 | 'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'), 35 | 'stage': 2, 36 | 'max_tgt_len': 512, 37 | 'lora_r': 32, 38 | 'lora_alpha': 32, 39 | 'lora_dropout': 0.1, 40 | } 41 | model = OpenLLAMAPEFTModel(**self.args) 42 | delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) 43 | model.load_state_dict(delta_ckpt, strict=False) 44 | torch.cuda.empty_cache() 45 | self.model = model.eval().half().cuda() 46 | kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} 47 | kwargs_default.update(kwargs) 48 | self.kwargs = kwargs_default 49 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 50 | 51 | def generate_inner(self, message, dataset=None): 52 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 53 | struct = { 54 | 'prompt': prompt, 55 | 'image_paths': [image_path], 56 | 'audio_paths': [], 57 | 'video_paths': [], 58 | 'thermal_paths': [], 59 | 'modality_embeds': [] 60 | } 61 | struct.update(self.kwargs) 62 | resp = self.model.generate(struct) 63 | return resp 64 | -------------------------------------------------------------------------------- /vlmeval/vlm/phi4_multimodal.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class Phi4Multimodal(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='microsoft/Phi-4-multimodal-instruct', **kwargs): 14 | try: 15 | from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig 16 | except Exception as e: 17 | logging.critical('Please install the latest version transformers.') 18 | raise e 19 | 20 | model = AutoModelForCausalLM.from_pretrained( 21 | model_path, device_map='cuda', trust_remote_code=True, 22 | torch_dtype='auto',attn_implementation='flash_attention_2' 23 | ).eval() 24 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 25 | generation_config = GenerationConfig.from_pretrained(model_path) 26 | 27 | self.model = model 28 | self.processor = processor 29 | # self.kwargs = kwargs 30 | self.generation_config = generation_config 31 | 32 | def generate_inner(self, message, dataset=None): 33 | user_question = '\n'.join([msg['value'] for msg in message if msg['type'] == 'text']) 34 | images = [Image.open(msg['value']).convert('RGB') for msg in message if msg['type'] == 'image'] 35 | 36 | user_prompt = '<|user|>' 37 | assistant_prompt = '<|assistant|>' 38 | prompt_suffix = '<|end|>' 39 | prompt = f'{user_prompt}<|image_1|>{user_question}{prompt_suffix}{assistant_prompt}' 40 | inputs = self.processor(text=prompt, images=images[0], return_tensors='pt').to('cuda') 41 | 42 | # Generate response 43 | generate_ids = self.model.generate( 44 | **inputs, 45 | max_new_tokens=1000, 46 | generation_config=self.generation_config, 47 | ) 48 | generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] 49 | response = self.processor.batch_decode( 50 | generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False 51 | )[0] 52 | return response 53 | -------------------------------------------------------------------------------- /vlmeval/vlm/pixtral.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from .base import BaseModel 4 | from ..smp import * 5 | import warnings 6 | from huggingface_hub import snapshot_download 7 | 8 | 9 | class Pixtral(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = True 13 | 14 | def __init__(self, model_path='mistralai/Pixtral-12B-2409', **kwargs): 15 | 16 | self.model_path = model_path 17 | try: 18 | from mistral_inference.transformer import Transformer 19 | from mistral_common.tokens.tokenizers.mistral import MistralTokenizer 20 | except ImportError as err: 21 | logging.critical('Please install `mistral-inference` and `mistral_common`') 22 | raise err 23 | 24 | if os.path.exists(model_path): 25 | cache_path = model_path 26 | else: 27 | if get_cache_path(model_path) is None: 28 | snapshot_download(repo_id=model_path) 29 | cache_path = 
get_cache_path(self.model_path, repo_type='models') 30 | 31 | self.tokenizer = MistralTokenizer.from_file(f'{cache_path}/tekken.json') 32 | model = Transformer.from_folder(cache_path, device='cpu') 33 | model.cuda() 34 | self.model = model 35 | self.max_tokens = 2048 36 | 37 | def generate_inner(self, message, dataset=None): 38 | try: 39 | from mistral_inference.generate import generate 40 | from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk 41 | from mistral_common.protocol.instruct.request import ChatCompletionRequest 42 | except ImportError as err: 43 | logging.critical('Please install `mistral-inference` and `mistral_common`') 44 | raise err 45 | 46 | msg_new = [] 47 | for msg in message: 48 | tp, val = msg['type'], msg['value'] 49 | if tp == 'text': 50 | msg_new.append(TextChunk(text=val)) 51 | elif tp == 'image': 52 | b64 = encode_image_file_to_base64(val) 53 | image_url = f'data:image/jpeg;base64,{b64}' 54 | msg_new.append(ImageURLChunk(image_url=image_url)) 55 | 56 | completion_request = ChatCompletionRequest(messages=[UserMessage(content=msg_new)]) 57 | encoded = self.tokenizer.encode_chat_completion(completion_request) 58 | images = encoded.images 59 | tokens = encoded.tokens 60 | 61 | out_tokens, _ = generate( 62 | [tokens], 63 | self.model, 64 | images=[images], 65 | max_tokens=self.max_tokens, 66 | temperature=0, 67 | eos_id=self.tokenizer.instruct_tokenizer.tokenizer.eos_id) 68 | 69 | result = self.tokenizer.decode(out_tokens[0]) 70 | return result 71 | -------------------------------------------------------------------------------- /vlmeval/vlm/qh_360vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import os.path as osp 5 | from PIL import Image 6 | from .base import BaseModel 7 | from ..smp import * 8 | from ..dataset import DATASET_TYPE 9 | 10 | 11 | class QH_360VL(BaseModel): 12 | 13 | INSTALL_REQ = False 14 | INTERLEAVE = False 15 | 16 | def __init__(self, model_path='qihoo360/360VL-70B', **kwargs): 17 | assert model_path is not None 18 | self.model_path = model_path 19 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 20 | self.model = AutoModelForCausalLM.from_pretrained(model_path, 21 | torch_dtype=torch.float16, 22 | low_cpu_mem_usage=True, 23 | device_map='auto', 24 | trust_remote_code=True).eval() 25 | vision_tower = self.model.get_vision_tower() 26 | vision_tower.load_model() 27 | vision_tower.to(device='cuda', dtype=torch.float16) 28 | self.image_processor = vision_tower.image_processor 29 | self.tokenizer.pad_token = self.tokenizer.eos_token 30 | self.kwargs = kwargs 31 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 32 | torch.cuda.empty_cache() 33 | 34 | def generate(self, message, dataset=None): 35 | 36 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 37 | print(prompt) 38 | image = Image.open(image_path).convert('RGB') 39 | terminators = [ 40 | self.tokenizer.convert_tokens_to_ids('<|eot_id|>',) 41 | ] 42 | inputs = self.model.build_conversation_input_ids(self.tokenizer, 43 | query=prompt, 44 | image=image, 45 | image_processor=self.image_processor) 46 | input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True) 47 | images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True) 48 | 49 | output_ids = self.model.generate(input_ids=input_ids, 50 | images=images, 51 | do_sample=False, 52 | num_beams=1, 53 | max_new_tokens=512, 54 | eos_token_id=terminators, 55 | use_cache=True) 56 | 57 | input_token_len = input_ids.shape[1] 58 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 59 | response = outputs.strip() 60 | 61 | return response 62 | -------------------------------------------------------------------------------- /vlmeval/vlm/qwen2_vl/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import Qwen2VLChat, Qwen2VLChatAguvis 2 | from .prompt import Qwen2VLPromptMixin 3 | -------------------------------------------------------------------------------- /vlmeval/vlm/slime.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from abc import abstractproperty 4 | import sys 5 | import os.path as osp 6 | from .base import BaseModel 7 | from ..smp import * 8 | from ..dataset import DATASET_TYPE 9 | import copy 10 | 11 | 12 | class SliME(BaseModel): 13 | 14 | INSTALL_REQ = True 15 | INTERLEAVE = True 16 | 17 | DEFAULT_IMAGE_TOKEN = '' 18 | IMAGE_TOKEN_INDEX = -200 19 | 20 | def __init__(self, model_path='yifanzhang114/SliME-Llama3-8B', **kwargs): 21 | assert model_path is not None 22 | try: 23 | from llava.model.builder import load_pretrained_model 24 | from llava.conversation import conv_templates 25 | from llava.mm_utils import get_model_name_from_path, tokenizer_image_token 26 | except Exception as err: 27 | logging.critical('Please install requirements on https://github.com/yfzhang114/SliME before using SliME') 28 | raise err 29 | 30 | model_name = get_model_name_from_path(model_path) 31 | tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, model_name, device_map=None) 32 | model.cuda().eval() 33 | model.tie_weights() 34 | 35 | if 'llama3' in model_path.lower(): 36 | conv_mode = 'llama3' 37 | elif 'vicuna' in model_path.lower(): 38 | conv_mode = 'v1' 39 | self.conv_template = conv_mode 40 | self.conv_templates = conv_templates 41 | self.tokenizer = tokenizer 42 | self.model = model 43 | self.image_processor = image_processor 44 | self.tokenizer_image_token = tokenizer_image_token 45 | 46 | def generate_inner(self, message, dataset=None): 47 | content, images = '', [] 48 | for msg in message: 49 | if msg['type'] == 'text': 50 | content += msg['value'] 51 | else: 52 | images.append(Image.open(msg['value']).convert('RGB')) 53 | content += (self.DEFAULT_IMAGE_TOKEN + '\n') 54 | 55 | preprocess = self.image_processor.preprocess 56 | image_tokenizer = self.tokenizer_image_token 57 | image_tensor = [ 58 | preprocess(f, return_tensors='pt')['pixel_values'][0].half().cuda() for f in images 59 | ] 60 | image_tensor = torch.stack(image_tensor) 
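# Next: build the conversation prompt from the selected template, then tokenize it with
# tokenizer_image_token so each image placeholder is mapped to IMAGE_TOKEN_INDEX (-200).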
61 | 62 | conv = copy.deepcopy(self.conv_templates[self.conv_template]) 63 | conv.messages = list(conv.messages) 64 | conv.append_message(conv.roles[0], content) 65 | conv.append_message(conv.roles[1], None) 66 | prompt_question = conv.get_prompt() 67 | 68 | input_ids = image_tokenizer(prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors='pt') 69 | input_ids = input_ids.unsqueeze(0).cuda() 70 | 71 | cont = self.model.generate( 72 | input_ids, 73 | images=image_tensor, 74 | do_sample=False, 75 | temperature=0, 76 | max_new_tokens=512, 77 | ) 78 | text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0] 79 | return text_outputs 80 | -------------------------------------------------------------------------------- /vlmeval/vlm/ursa/__init__.py: -------------------------------------------------------------------------------- 1 | from .ursa_chat import UrsaChat 2 | -------------------------------------------------------------------------------- /vlmeval/vlm/ursa/ursa_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | from .image_processing_vlm import VLMImageProcessor, VLMImageProcessorConfig 15 | from .modeling_ursa import UrsaForConditionalGeneration, UrsaForTokenClassification 16 | from .processing_ursa import UrsaProcessor 17 | from .configuration_ursa import VisionConfig, UrsaConfig, AlignerConfig 18 | from .projector import MlpProjector 19 | 20 | __all__ = [ 21 | "VLMImageProcessor", 22 | "UrsaProcessor", 23 | "UrsaForConditionalGeneration", 24 | "UrsaForTokenClassification", 25 | "VLMImageProcessorConfig", 26 | "VisionConfig", 27 | "MlpProjector", 28 | "AlignerConfig", 29 | "UrsaConfig" 30 | ] 31 | -------------------------------------------------------------------------------- /vlmeval/vlm/ursa/ursa_model/processing_ursa.py: -------------------------------------------------------------------------------- 1 | # Copyright (2025) Bytedance Ltd. and/or its affiliates 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
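# UrsaProcessor below pairs an AutoImageProcessor with an AutoTokenizer behind a single
# __call__ that merges their outputs into one BatchFeature, following the ProcessorMixin pattern.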
13 | 14 | from typing import List, Optional, Union 15 | 16 | from transformers.feature_extraction_utils import BatchFeature 17 | from transformers.image_utils import ImageInput 18 | from transformers.processing_utils import ProcessorMixin 19 | from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy 20 | from transformers.utils import TensorType 21 | 22 | 23 | class UrsaProcessor(ProcessorMixin): 24 | attributes = ["image_processor", "tokenizer"] 25 | valid_kwargs = ["chat_template"] 26 | image_processor_class = "AutoImageProcessor" 27 | tokenizer_class = "AutoTokenizer" 28 | 29 | def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): 30 | super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs) 31 | 32 | def __call__( 33 | self, 34 | text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, 35 | images: ImageInput = None, 36 | padding: Union[bool, str, PaddingStrategy] = False, 37 | truncation: Union[bool, str, TruncationStrategy] = None, 38 | max_length=None, 39 | return_tensors: Optional[Union[str, TensorType]] = None, # or TensorType.PYTORCH 40 | ) -> BatchFeature: 41 | image_inputs = {} 42 | if images is not None: 43 | image_inputs = self.image_processor(images, return_tensors=return_tensors) 44 | text_inputs = self.tokenizer( 45 | text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length 46 | ) 47 | return BatchFeature(data={**text_inputs, **image_inputs}) 48 | 49 | def decode(self, *args, **kwargs): 50 | return self.tokenizer.decode(*args, **kwargs) 51 | 52 | def batch_decode(self, *args, **kwargs): 53 | return self.tokenizer.batch_decode(*args, **kwargs) 54 | 55 | @property 56 | # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names 57 | def model_input_names(self): 58 | tokenizer_input_names = self.tokenizer.model_input_names 59 | image_processor_input_names = self.image_processor.model_input_names 60 | return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) 61 | -------------------------------------------------------------------------------- /vlmeval/vlm/valley/__init__.py: -------------------------------------------------------------------------------- 1 | from .valley import Valley2Chat 2 | -------------------------------------------------------------------------------- /vlmeval/vlm/valley/requirements_valley.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.34.2 2 | bert-score==0.3.13 3 | byted-wandb==0.13.72 4 | datasets==2.21.0 5 | einops==0.8.0 6 | evaluate==0.4.3 7 | fastapi==0.115.0 8 | flash_attn 9 | ftfy==6.2.3 10 | markdown2==2.5.0 11 | ninja==1.11.1.1 12 | nltk==3.9.1 13 | numpy==1.26.4 14 | omegaconf==2.3.0 15 | openai==0.28 16 | opencv-python-headless==4.10.0.84 17 | packaging==24.1 18 | pandas==2.2.2 19 | peft==0.5.0 20 | prettytable==3.11.0 21 | protobuf==3.20.3 22 | pyarrow==15.0.0 23 | pydantic==1.10.14 24 | qwen_vl_utils 25 | requests==2.32.3 26 | rouge-score==0.1.2 27 | scikit-image==0.24.0 28 | scikit-learn==1.5.2 29 | sentencepiece==0.1.97 30 | timm==0.6.7 31 | tokenizers>=0.13.3 32 | torchmetrics 33 | transformers==4.45.2 34 | uvicorn==0.30.6 35 | -------------------------------------------------------------------------------- /vlmeval/vlm/video_llm/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.video_llava import VideoLLaVA, VideoLLaVA_HF 2 | from .videochat2 import VideoChat2_HD 3 | from .chat_uni_vi import Chatunivi 4 | from .video_chatgpt import VideoChatGPT 5 | from .llama_vid import LLaMAVID 6 | from .pllava import PLLaVA 7 | 8 | __all__ = ['VideoLLaVA', 'VideoLLaVA_HF', 'Chatunivi', 'VideoChatGPT', 'LLaMAVID', 'VideoChat2_HD', 'PLLaVA'] 9 | -------------------------------------------------------------------------------- /vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": 224, 3 | "do_center_crop": true, 4 | "do_normalize": true, 5 | "do_resize": true, 6 | "feature_extractor_type": "CLIPFeatureExtractor", 7 | "image_mean": [ 8 | 0.48145466, 9 | 0.4578275, 10 | 0.40821073 11 | ], 12 | "image_std": [ 13 | 0.26862954, 14 | 0.26130258, 15 | 0.27577711 16 | ], 17 | "resample": 3, 18 | "size": 224 19 | } 20 | -------------------------------------------------------------------------------- /vlmeval/vlm/video_llm/configs/videochat2_hd.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "model_cls": "VideoChat2_it_hd_mistral", 4 | "vit_blip_model_path": "OpenGVLab/videochat2", 5 | "mistral_model_path": "mistralai/Mistral-7B-Instruct-v0.2", 6 | "videochat2_model_path": "OpenGVLab/VideoChat2_stage2_Mistral_7B", 7 | "freeze_vit": false, 8 | "freeze_qformer": false, 9 | "max_txt_len": 512, 10 | "low_resource": false, 11 | "vision_encoder": { 12 | "name": "vit_l14", 13 | "img_size": 224, 14 | "patch_size": 16, 15 | "d_model": 1024, 16 | "encoder_embed_dim": 1024, 17 | "encoder_depth": 24, 18 | "encoder_num_heads": 16, 19 | "drop_path_rate": 0.0, 20 | "num_frames": 8, 21 | "tubelet_size": 1, 22 | "use_checkpoint": true, 23 | "checkpoint_num": 18, 24 | "pretrained": "", 25 | "return_index": -2, 26 | "vit_add_ln": true, 27 | "ckpt_num_frame": 4 28 | }, 29 | "num_query_token": 32, 30 | "qformer_hidden_dropout_prob": 0.1, 31 | "qformer_attention_probs_dropout_prob": 0.1, 32 | "qformer_drop_path_rate": 0.2, 33 | "extra_num_query_token": 64, 34 | "qformer_text_input": true, 35 | "system": "", 36 | "start_token": "", 38 | "add_second_msg": true, 39 | "img_start_token": "", 40 | "img_end_token": "", 41 | "random_shuffle": true, 42 | "return_question_instruction": false, 43 | "use_flash_attention": true, 44 | "use_lora": false, 45 | "lora_r": 16, 46 | "lora_alpha": 32, 47 | "lora_dropout": 0.1, 48 | "dynamic_config": { 49 | "local_size": 224, 50 | "hd_num": 6, 51 | "padding": false, 52 | "add_global": true 53 | } 54 | }, 55 | "device": "cuda" 56 | } 57 | -------------------------------------------------------------------------------- /vlmeval/vlm/video_llm/video_chatgpt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import warnings 4 | import copy as cp 5 | import numpy as np 6 | import sys 7 | import logging 8 | from ..base import BaseModel 9 | from ...smp import isimg, listinstr 10 | from ...dataset import DATASET_TYPE 11 | from huggingface_hub import snapshot_download 12 | 13 | 14 | class VideoChatGPT(BaseModel): 15 | INSTALL_REQ = True 16 | INTERLEAVE = False 17 | VIDEO_LLM = True 18 | # sample a video in 100 frames 19 | 20 | def __init__(self, model_path='MBZUAI/Video-ChatGPT-7B', dir_root=None, **kwargs): 21 | assert model_path is not None 22 | sys.path.append(dir_root) 23 | try: 24 | from 
video_chatgpt.eval.model_utils import initialize_model 25 | except Exception as err: 26 | logging.critical( 27 | 'Please first install requirements and set the root path to use Video-ChatGPT. \ 28 | Follow the instructions at https://github.com/mbzuai-oryx/Video-ChatGPT.' 29 | ) 30 | raise err 31 | base_model_path = snapshot_download('mmaaz60/LLaVA-7B-Lightening-v1-1') 32 | projection_path = snapshot_download(model_path) 33 | projection_name = 'video_chatgpt-7B.bin' 34 | projection_path = os.path.join(projection_path, projection_name) 35 | 36 | model, vision_tower, tokenizer, image_processor, video_token_len = initialize_model( 37 | base_model_path, projection_path 38 | ) 39 | self.tokenizer = tokenizer 40 | self.model = model 41 | self.processor = image_processor 42 | self.context_len = video_token_len 43 | self.kwargs = kwargs 44 | self.vision_tower = vision_tower 45 | 46 | def get_model_output(self, model, video_processor, tokenizer, video, qs): 47 | from video_chatgpt.eval.model_utils import load_video 48 | from video_chatgpt.inference import video_chatgpt_infer 49 | conv_mode = 'video-chatgpt_v1' 50 | 51 | video_frames = load_video(video) 52 | # Run inference on the video and questions 53 | output = video_chatgpt_infer( 54 | video_frames, 55 | qs, 56 | conv_mode, 57 | model, 58 | self.vision_tower, 59 | tokenizer, 60 | video_processor, 61 | self.context_len, 62 | ) 63 | return output 64 | 65 | def generate_inner(self, message, dataset=None): 66 | question, video = self.message_to_promptvideo(message) 67 | response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question) 68 | return response 69 | -------------------------------------------------------------------------------- /vlmeval/vlm/visualglm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .base import BaseModel 3 | from ..smp import * 4 | 5 | 6 | class VisualGLM(BaseModel): 7 | 8 | INSTALL_REQ = False 9 | INTERLEAVE = False 10 | 11 | def __init__(self, model_path='THUDM/visualglm-6b', **kwargs): 12 | try: 13 | import sat 14 | except Exception as err: 15 | logging.critical('Please install SwissArmyTransformer to use VisualGLM') 16 | raise err 17 | 18 | assert model_path is not None 19 | self.model_path = model_path 20 | 21 | from transformers import AutoModel 22 | from transformers import AutoTokenizer 23 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 24 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda() 25 | self.model = model 26 | self.kwargs = kwargs 27 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 28 | 29 | def generate_inner(self, message, dataset=None): 30 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 31 | output, _ = self.model.chat( 32 | image_path=image_path, 33 | tokenizer=self.tokenizer, 34 | query=prompt, 35 | history=[], 36 | **self.kwargs 37 | ) 38 | return output 39 | -------------------------------------------------------------------------------- /vlmeval/vlm/wemm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import sys 4 | from ..smp import * 5 | from .base import BaseModel 6 | from ..dataset import DATASET_TYPE 7 | from transformers import AutoModel, GenerationConfig 8 | 9 | 10 | class WeMM(BaseModel): 11 | def __init__(self, model_path='feipengma/WeMM', **kwargs): 12 | self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True) 13 | self.wemm.cuda() 14 | self.wemm.eval() 15 | torch.cuda.empty_cache() 16 | 17 | def use_custom_prompt(self, dataset): 18 | assert dataset is not None 19 | if DATASET_TYPE(dataset) == 'MCQ': 20 | return True 21 | return False 22 | 23 | def build_prompt(self, line, dataset=None): 24 | assert self.use_custom_prompt(dataset) 25 | assert dataset is None or isinstance(dataset, str) 26 | tgt_path = self.dump_image(line, dataset) 27 | question = line['question'] 28 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 29 | if hint is not None: 30 | question = hint + '\n' + question 31 | options = { 32 | cand: line[cand] 33 | for cand in string.ascii_uppercase 34 | if cand in line and not pd.isna(line[cand]) 35 | } 36 | for key, item in options.items(): 37 | question += f'\n{key}. {item}' 38 | prompt = question 39 | 40 | if len(options): 41 | prompt += ( 42 | '\n请直接回答选项字母。' if cn_string(prompt) else 43 | "\nAnswer with the option's letter from the given choices directly." 44 | ) 45 | else: 46 | prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' 47 | 48 | message = [dict(type='text', value=prompt)] 49 | message.extend([dict(type='image', value=p) for p in tgt_path]) 50 | return message 51 | 52 | def generate_inner(self, message, dataset=None): 53 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 54 | 55 | if dataset == 'HallusionBench': 56 | prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.' 
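# Only MMVet gets a dedicated GenerationConfig below; every other dataset passes
# gen_config=None to wemm.mm_generate.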
57 | 58 | gen_config = None 59 | if dataset == 'MMVet': 60 | gen_config = GenerationConfig( 61 | max_new_tokens=512, 62 | do_sample=True, 63 | temperatures=0.7, 64 | num_beams=3, 65 | eos_token_id=self.wemm.tokenizer.eos_token_id, 66 | pad_token_id=self.wemm.tokenizer.pad_token_id 67 | if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id, 68 | ) 69 | pred = self.wemm.mm_generate(image_path, prompt, gen_config) 70 | 71 | return pred 72 | -------------------------------------------------------------------------------- /vlmeval/vlm/xcomposer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sharecaptioner import ShareCaptioner 2 | from .xcomposer import XComposer 3 | from .xcomposer2 import XComposer2 4 | from .xcomposer2_4KHD import XComposer2_4KHD 5 | from .xcomposer2d5 import XComposer2d5 6 | 7 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD', 'XComposer2d5'] 8 | -------------------------------------------------------------------------------- /vlmeval/vlm/xgen_mm.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class XGenMM(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = True 12 | 13 | def __init__(self, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5', **kwargs): 14 | try: 15 | from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor 16 | except Exception as err: 17 | logging.critical('Please install the latest version transformers.') 18 | raise err 19 | 20 | model = AutoModelForVision2Seq.from_pretrained( 21 | model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto' 22 | ).eval() 23 | 24 | tokenizer = AutoTokenizer.from_pretrained( 25 | model_path, trust_remote_code=True, use_fast=False, legacy=False 26 | ) 27 | tokenizer = model.update_special_tokens(tokenizer) 28 | tokenizer.eos_token = '<|end|>' 29 | tokenizer.padding_side = 'left' 30 | image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True) 31 | self.model = model 32 | self.image_processor = image_processor 33 | self.tokenizer = tokenizer 34 | self.kwargs = kwargs 35 | 36 | def apply_prompt_template(self, query): 37 | s = ( 38 | '<|system|>\nA chat between a curious user and an artificial intelligence assistant. 
' 39 | "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n" 40 | f'<|user|>\n{query}<|end|>\n<|assistant|>\n' 41 | ) 42 | return s 43 | 44 | def generate_inner(self, message, dataset=None): 45 | 46 | content, images, image_sizes = '', [], [] 47 | 48 | for msg in message: 49 | if msg['type'] == 'text': 50 | content += msg['value'] 51 | elif msg['type'] == 'image': 52 | image = Image.open(msg['value']).convert('RGB') 53 | images.append(self.image_processor([image], image_aspect_ratio='anyres')['pixel_values'].to('cuda')) 54 | image_sizes.append(image.size) 55 | content += ' ' 56 | 57 | inputs = {'pixel_values': [images]} 58 | prompt = self.apply_prompt_template(content) 59 | language_inputs = self.tokenizer([prompt], return_tensors='pt').to('cuda') 60 | inputs.update(language_inputs) 61 | 62 | generation_args = { 63 | 'max_new_tokens': 1024, 64 | 'temperature': 0.0, 65 | 'do_sample': False, 66 | 'top_p': None, 67 | 'num_beams': 1 68 | } 69 | generation_args.update(self.kwargs) 70 | 71 | generate_ids = self.model.generate( 72 | **inputs, image_size=[image_sizes], 73 | pad_token_id=self.tokenizer.pad_token_id, 74 | eos_token_id=self.tokenizer.eos_token_id, 75 | **generation_args 76 | ) 77 | 78 | # remove input tokens 79 | response = self.tokenizer.decode(generate_ids[0], skip_special_tokens=True).split('<|end|>')[0] 80 | 81 | return response 82 | --------------------------------------------------------------------------------