├── .DS_Store
├── README.md
├── VLMEvalKit
├── scripts
│ ├── AI2D_preproc.ipynb
│ ├── apires_scan.py
│ ├── auto_run.py
│ ├── cover.sh
│ ├── mmb_eval_gradio.py
│ ├── run.sh
│ ├── srun.sh
│ ├── summarize.py
│ └── visualize.ipynb
├── test_Mantis.py
├── test_interlvl1.5.py
├── test_internvl2-pro.py
├── test_models.py
├── utils
│ ├── __pycache__
│ │ └── conversation.cpython-39.pyc
│ ├── conversation.py
│ └── tools.py
└── vlmeval
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-39.pyc
│ ├── config.cpython-39.pyc
│ └── tools.cpython-39.pyc
│ ├── api
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-39.pyc
│ │ ├── base.cpython-39.pyc
│ │ ├── claude.cpython-39.pyc
│ │ ├── gemini.cpython-39.pyc
│ │ ├── glm_vision.cpython-39.pyc
│ │ ├── gpt.cpython-39.pyc
│ │ ├── gpt_int.cpython-39.pyc
│ │ ├── hf_chat_model.cpython-39.pyc
│ │ ├── qwen_api.cpython-39.pyc
│ │ ├── qwen_vl_api.cpython-39.pyc
│ │ ├── reka.cpython-39.pyc
│ │ └── stepai.cpython-39.pyc
│ ├── base.py
│ ├── claude.py
│ ├── gemini.py
│ ├── glm_vision.py
│ ├── gpt.py
│ ├── gpt_int.py
│ ├── hf_chat_model.py
│ ├── qwen_api.py
│ ├── qwen_vl_api.py
│ ├── reka.py
│ └── stepai.py
│ ├── config.py
│ ├── evaluate
│ ├── OCRBench.py
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── OCRBench.cpython-39.pyc
│ │ ├── __init__.cpython-39.pyc
│ │ ├── coco_eval.cpython-39.pyc
│ │ ├── llavabench.cpython-39.pyc
│ │ ├── mathvista_eval.cpython-39.pyc
│ │ ├── misc.cpython-39.pyc
│ │ ├── mmvet_eval.cpython-39.pyc
│ │ ├── multiple_choice.cpython-39.pyc
│ │ ├── vqa_eval.cpython-39.pyc
│ │ └── yes_or_no.cpython-39.pyc
│ ├── coco_eval.py
│ ├── llavabench.py
│ ├── mathvista_eval.py
│ ├── misc.py
│ ├── mmvet_eval.py
│ ├── multiple_choice.py
│ ├── multiple_choice_mmeval.py
│ ├── vqa_eval.py
│ └── yes_or_no.py
│ ├── inference.py
│ ├── smp
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-39.pyc
│ │ ├── file.cpython-39.pyc
│ │ ├── log.cpython-39.pyc
│ │ ├── misc.cpython-39.pyc
│ │ └── vlm.cpython-39.pyc
│ ├── file.py
│ ├── log.py
│ ├── misc.py
│ └── vlm.py
│ ├── tools.py
│ ├── utils
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-39.pyc
│ │ ├── custom_prompt.cpython-39.pyc
│ │ ├── dataset.cpython-39.pyc
│ │ ├── dataset_config.cpython-39.pyc
│ │ ├── matching_util.cpython-39.pyc
│ │ ├── mp_util.cpython-39.pyc
│ │ └── result_transfer.cpython-39.pyc
│ ├── custom_prompt.py
│ ├── dataset.py
│ ├── dataset_config.py
│ ├── matching_util.py
│ ├── mp_util.py
│ └── result_transfer.py
│ └── vlm
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-39.pyc
│ ├── base.cpython-39.pyc
│ ├── bunnyllama3.cpython-39.pyc
│ ├── cogvlm.cpython-39.pyc
│ ├── deepseek_vl.cpython-39.pyc
│ ├── emu.cpython-39.pyc
│ ├── idefics.cpython-39.pyc
│ ├── instructblip.cpython-39.pyc
│ ├── internvl_chat.cpython-39.pyc
│ ├── mgm.cpython-39.pyc
│ ├── minicpm_v.cpython-39.pyc
│ ├── minigpt4.cpython-39.pyc
│ ├── mmalaya.cpython-39.pyc
│ ├── monkey.cpython-39.pyc
│ ├── mplug_owl2.cpython-39.pyc
│ ├── omnilmm.cpython-39.pyc
│ ├── open_flamingo.cpython-39.pyc
│ ├── paligemma.cpython-39.pyc
│ ├── pandagpt.cpython-39.pyc
│ ├── phi3_vision.cpython-39.pyc
│ ├── qh_360vl.cpython-39.pyc
│ ├── qwen_vl.cpython-39.pyc
│ ├── transcore_m.cpython-39.pyc
│ ├── visualglm.cpython-39.pyc
│ ├── vxverse.cpython-39.pyc
│ ├── wemm.cpython-39.pyc
│ └── yi_vl.cpython-39.pyc
│ ├── base.py
│ ├── bunnyllama3.py
│ ├── cogvlm.py
│ ├── deepseek_vl.py
│ ├── emu.py
│ ├── idefics.py
│ ├── instructblip.py
│ ├── internvl_chat.py
│ ├── llava
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-39.pyc
│ │ ├── llava.cpython-39.pyc
│ │ └── llava_xtuner.cpython-39.pyc
│ ├── llava.py
│ └── llava_xtuner.py
│ ├── mgm.py
│ ├── minicpm_v.py
│ ├── minigpt4.py
│ ├── misc
│ ├── blip2_instruct_vicuna13b.yaml
│ ├── blip2_instruct_vicuna7b.yaml
│ ├── minigpt4_13b_eval.yaml
│ ├── minigpt4_7b_eval.yaml
│ └── minigptv2_eval.yaml
│ ├── mmalaya.py
│ ├── monkey.py
│ ├── mplug_owl2.py
│ ├── omnilmm.py
│ ├── open_flamingo.py
│ ├── paligemma.py
│ ├── pandagpt.py
│ ├── phi3_vision.py
│ ├── qh_360vl.py
│ ├── qwen_vl.py
│ ├── transcore_m.py
│ ├── visualglm.py
│ ├── vxverse.py
│ ├── wemm.py
│ ├── xcomposer
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-39.pyc
│ │ ├── sharecaptioner.cpython-39.pyc
│ │ ├── xcomposer.cpython-39.pyc
│ │ ├── xcomposer2.cpython-39.pyc
│ │ └── xcomposer2_4KHD.cpython-39.pyc
│ ├── sharecaptioner.py
│ ├── xcomposer.py
│ ├── xcomposer2.py
│ └── xcomposer2_4KHD.py
│ └── yi_vl.py
├── assets
├── overview.jpg
└── taskmap.jpg
├── evaluate.py
└── evaluate_correct.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Best Practice
2 |
3 | **We strongly recommend using [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) for its useful features and ready-to-use LVLM implementations.**
4 |
5 | # MMIU
6 |
7 |
8 | Quick Start |
9 | HomePage |
10 | arXiv |
11 | Dataset |
12 | Citation
13 |
14 |
15 |
16 | This repository is the official implementation of [MMIU](https://arxiv.org/abs/2408.02718).
17 |
18 | > [MMIU: Multimodal Multi-image Understanding for Evaluating Large Vision-Language Models](https://arxiv.org/abs/2408.02718)
19 | > Fanqing Meng\*, Jin Wang\*, Chuanhao Li\*, Quanfeng Lu, Hao Tian, Jiaqi Liao, Xizhou Zhu, Jifeng Dai, Yu Qiao, Ping Luo, Kaipeng Zhang\#, Wenqi Shao\#
20 | > \* MFQ, WJ and LCH contributed equally.
21 | > \# SWQ (shaowenqi@pjlab.org.cn) and ZKP (zhangkaipeng@pjlab.org.cn) are corresponding authors.
22 |
23 | ## 💡 News
24 |
25 | - `2024/08/13`: We have released the code.
26 |
27 | - `2024/08/08`: We have released the dataset at https://huggingface.co/datasets/FanqingM/MMIU-Benchmark 🔥🔥🔥
28 |
29 | - `2024/08/05`: The datasets and codes are coming soon! 🔥🔥🔥
30 |
31 | - `2024/08/05`: The technical report of [MMIU](https://arxiv.org/abs/2408.02718) is released! And check our [project page](https://mmiu-bench.github.io/)! 🔥🔥🔥
32 |
33 |
34 | ## Introduction
35 | We introduce the Multimodal Multi-image Understanding (MMIU) benchmark, a comprehensive evaluation suite designed to assess LVLMs across a wide range of multi-image tasks. MMIU encompasses 7 types of multi-image relationships, 52 tasks, 77K images, and 11K meticulously curated multiple-choice questions, making it the most extensive benchmark of its kind.
36 | 
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 | ## Evaluation Results Overview
46 | - The closed-source proprietary model GPT-4o from OpenAI takes the leading position on MMIU, surpassing other models such as InternVL2-pro, InternVL1.5-chat, Claude3.5-Sonnet, and Gemini 1.5 Flash. Notably, InternVL2-pro is the best-performing open-source model.
47 |
48 |
49 | - Some powerful LVLMs, such as InternVL1.5 and GLM4V, whose pre-training data contain no multi-image content, even outperform many multi-image models that undergo multi-image supervised fine-tuning (SFT), indicating that strong single-image understanding is the foundation of multi-image comprehension.
50 | - Comparing performance at the level of image relationships, we find that LVLMs excel at understanding semantic content in multi-image scenarios but are weaker at comprehending temporal and spatial relationships across images.
51 | - Analysis based on the task map reveals that models perform better on high-level, in-domain understanding tasks such as video captioning, but struggle with out-of-domain tasks such as 3D perception (e.g., 3D detection) and temporal reasoning (e.g., image ordering).
52 | - Task-difficulty analysis shows that tasks involving ordering, retrieval, and massive numbers of images cannot be overfitted by simple SFT, suggesting that additional pre-training data or training techniques should be incorporated for improvement.
53 | 
54 |
55 |
56 | ## 🏆 Leaderboard
57 |
58 |
59 |
60 | | Rank | Model | Score |
61 | | ---- | ---------------------- | ----- |
62 | | **1** | **GPT4o** | **55.72** |
63 | | 2 | Gemini | 53.41 |
64 | | 3 | Claude3 | 53.38 |
65 | | **4** | **InternVL2** | **50.30** |
66 | | 5 | Mantis | 45.58 |
67 | | 6 | Gemini1.0 | 40.25 |
68 | | 7 | internvl1.5-chat | 37.39 |
69 | | 8 | Llava-interleave | 32.37 |
70 | | 9 | idefics2_8b | 27.80 |
71 | | 10 | glm-4v-9b | 27.02 |
72 | | 11 | deepseek_vl_7b | 24.64 |
73 | | 12 | XComposer2_1.8b | 23.46 |
74 | | 13 | deepseek_vl_1.3b | 23.21 |
75 | | 14 | flamingov2 | 22.26 |
76 | | 15 | llava_next_vicuna_7b | 22.25 |
77 | | 16 | XComposer2 | 21.91 |
78 | | 17 | MiniCPM-Llama3-V-2_5 | 21.61 |
79 | | 18 | llava_v1.5_7b | 19.19 |
80 | | 19 | sharegpt4v_7b | 18.52 |
81 | | 20 | sharecaptioner | 16.10 |
82 | | 21 | qwen_chat | 15.92 |
83 | | 22 | monkey-chat | 13.74 |
84 | | 23 | idefics_9b_instruct | 12.84 |
85 | | 24 | qwen_base | 5.16 |
86 | | - | Frequency Guess | 31.5 |
87 | | - | Random Guess | 27.4 |
88 |
89 |
90 |
91 |
92 | ## 🚀 Quick Start
93 |
94 | We mainly use the VLMEvalKit framework for testing, with a few standalone test scripts. Models are grouped below by the `transformers` version they require (a small version-selection sketch follows the lists). For multi-image models, we include the following:
95 |
96 | **transformers == 4.33.0**
97 |
98 | - `XComposer2`
99 | - `XComposer2_1.8b`
100 | - `qwen_base`
101 | - `idefics_9b_instruct`
102 | - `qwen_chat`
103 | - `flamingov2`
104 |
105 | **transformers == 4.37.0**
106 | - `deepseek_vl_1.3b`
107 | - `deepseek_vl_7b`
108 |
109 | **transformers == 4.40.0**
110 |
111 | - `idefics2_8b`
112 |
113 | For single-image models, we include the following:
114 |
115 | **transformers == 4.33.0**
116 |
117 | - `sharecaptioner`
118 | - `monkey-chat`
119 |
120 | **transformers == 4.37.0**
121 |
122 | - `sharegpt4v_7b`
123 | - `llava_v1.5_7b`
124 | - `glm-4v-9b`
125 |
126 | **transformers == 4.40.0**
127 |
128 | - `llava_next_vicuna_7b`
129 | - `MiniCPM-Llama3-V-2_5`
130 |
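These version pins conflict, so each group needs to be run in an environment with the matching `transformers` release. The sketch below is illustrative only: the `MODEL_GROUPS` mapping simply mirrors the lists above (assuming the `4.33.0` / `4.37.0` / `4.40.0` releases), and it selects the model names that match the currently installed version.

```python
# Illustrative sketch: pick the model group that matches the installed transformers version.
# The mapping mirrors the lists in this README; installed patch versions may differ slightly.
import transformers

MODEL_GROUPS = {
    '4.33.0': ['XComposer2', 'XComposer2_1.8b', 'qwen_base', 'idefics_9b_instruct',
               'qwen_chat', 'flamingov2', 'sharecaptioner', 'monkey-chat'],
    '4.37.0': ['deepseek_vl_1.3b', 'deepseek_vl_7b', 'sharegpt4v_7b',
               'llava_v1.5_7b', 'glm-4v-9b'],
    '4.40.0': ['idefics2_8b', 'llava_next_vicuna_7b', 'MiniCPM-Llama3-V-2_5'],
}

models = MODEL_GROUPS.get(transformers.__version__, [])
print(f'transformers {transformers.__version__}: testing {models}')
```
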
131 | We use the VLMEvalKit framework for testing; the reference code is in `VLMEvalKit/test_models.py`. For closed-source models, replace the following generation call with the corresponding API request (see `test_internvl2-pro.py` for an example):
132 |
133 | ```python
134 | response = model.generate(tmp) # tmp = image_paths + [question]
135 | ```
136 |
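For reference, here is a minimal sketch of how `test_models.py` drives a VLMEvalKit model on a multi-image question (the model name, image paths, and question below are placeholders; the interleaved list of image paths followed by the question is the input format used in `test_models.py`):

```python
from vlmeval.config import supported_VLM

# Placeholder inputs: a list of image paths followed by the question text.
image_paths = ['example_0.png', 'example_1.png']
question = ('Which option best describes the relationship between the two images?'
            '\nPlease answer the option directly like A,B,C,D...')

model = supported_VLM['XComposer2']()  # any open-source model name listed above
tmp = image_paths + [question]         # images first, question last
response = model.generate(tmp)         # for closed-source models, replace this call with your API request
print(response)
```

This mirrors the per-sample loop in `test_models.py`, where `tmp` collects the paths in `task_data['input_image_path']` before the formatted question is appended.
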
137 | For other open-source models, we provide reference code for `Mantis` (`test_Mantis.py`) and `InternVL1.5-chat` (`test_interlvl1.5.py`). For `LLaVA-Interleave`, please refer to the original repository.
138 |
139 |
140 |
141 |
142 | ## 💐 Acknowledgement
143 |
144 | We express our sincere gratitude to the following projects:
145 | - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) provides useful out-of-the-box tools and implements many advanced LVLMs. Thanks for their selfless dedication.
146 | - The InternVL team for providing API access.
147 |
148 |
149 | ## 📧 Contact
150 | If you have any questions, feel free to contact Fanqing Meng at mengfanqing33@gmail.com.
151 |
152 |
153 |
154 | ## 🖊️ Citation
155 | If you find MMIU useful in your project or research, please kindly use the following BibTeX entry to cite our paper. Thanks!
156 |
157 | ```
158 | @article{meng2024mmiu,
159 | title={MMIU: Multimodal Multi-image Understanding for Evaluating Large Vision-Language Models},
160 | author={Meng, Fanqing and Wang, Jin and Li, Chuanhao and Lu, Quanfeng and Tian, Hao and Liao, Jiaqi and Zhu, Xizhou and Dai, Jifeng and Qiao, Yu and Luo, Ping and others},
161 | journal={arXiv preprint arXiv:2408.02718},
162 | year={2024}
163 | }
164 | ```
165 |
166 |
167 |
168 |
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/apires_scan.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from vlmeval import *
3 | FAIL_MSG = 'Failed to obtain answer via API.'
4 |
5 | root = sys.argv[1]
6 | if root[-1] in '/\\':
7 | root = root[:-1]
8 |
9 | model_name = root.split('/')[-1]
10 | datasets = list(dataset_URLs)
11 |
12 | for d in datasets:
13 | fname = f'{model_name}_{d}.xlsx'
14 | pth = osp.join(root, fname)
15 | if osp.exists(pth):
16 | data = load(pth)
17 | # Detect Failure
18 | assert 'prediction' in data
19 | fail = [FAIL_MSG in x for x in data['prediction']]
20 | if sum(fail):
21 | nfail = sum(fail)
22 | ntot = len(fail)
23 | print(f'Model {model_name} x Dataset {d}: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ')
24 |
25 | eval_files = ls(root, match=f'{model_name}_{d}_')
26 | eval_files = [x for x in eval_files if listinstr(['openai', 'gpt'], x) and x.endswith('.xlsx')]
27 |
28 | assert len(eval_files) == 1
29 | eval_file = eval_files[0]
30 | data = load(eval_file)
31 |
32 | if listinstr(['MathVista', 'MMVet'], d):
33 | bad = [x for x in data['log'] if 'All 5 retries failed.' in x]
34 | if len(bad):
35 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')
36 | elif d == 'LLaVABench':
37 | sub = data[data['gpt4_score'] == -1]
38 | sub = sub[sub['gpt4_score'] == -1]
39 | if len(sub):
40 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(sub)} out of {len(data)} failed.')
41 | else:
42 | bad = [x for x in data['log'] if FAIL_MSG in x]
43 | if len(bad):
44 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')
45 |
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/auto_run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from vlmeval.smp import *
3 | from vlmeval.config import supported_VLM
4 |
5 | def is_api(x):
6 | return getattr(supported_VLM[x].func, 'is_api', False)
7 |
8 | models = list(supported_VLM)
9 | models = [x for x in models if 'fs' not in x]
10 | models = [x for x in models if not is_api(x)]
11 | exclude_list = ['cogvlm-grounding-generalist', 'emu2']
12 | models = [x for x in models if x not in exclude_list]
13 |
14 | def is_large(x):
15 | return '80b' in x or 'emu2' in x or '34B' in x
16 |
17 | small_models = [x for x in models if not is_large(x)]
18 | large_models = [x for x in models if is_large(x)]
19 | models = small_models + large_models
20 |
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument('--data', type=str, nargs='+', required=True)
23 | args = parser.parse_args()
24 |
25 | # Skip some models
26 | models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)]
27 |
28 | for m in models:
29 | unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')]
30 | if len(unknown_datasets) == 0:
31 | continue
32 | dataset_str = ' '.join(unknown_datasets)
33 | if '80b' in m:
34 | cmd = f'python run.py --data {dataset_str} --model {m}'
35 | else:
36 | cmd = f'bash run.sh --data {dataset_str} --model {m}'
37 | print(cmd)
38 | os.system(cmd)
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/cover.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
3 | cp $DIR/../config.py $DIR/../vlmeval/
4 | cp $DIR/../misc/* $DIR/../vlmeval/vlm/misc/
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/mmb_eval_gradio.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 | from vlmeval.evaluate.multiple_choice import multiple_choice_eval
3 | import gradio as gr
4 |
5 | HEADER = """
6 | # Welcome to MMBench👏👏
7 | We are delighted that you are willing to submit the evaluation results to the MMBench official website! The evaluation service currently can handle submissions of MMBench, MMBench-CN, and CCBench. We use `gpt-3.5-turbo-0613` to help answer matching. Evaluation Codes in VLMEvalKit: https://github.com/open-compass/VLMEvalKit. Please adopt / follow the implementation of VLMEvalKit to generate the submission files.
8 | Moreover, this is a temporary solution, which **does not support ChatGPT-based answer extraction**. So you need to make sure values in the `prediction` field of your submission files are single characters in A, B, C, D. Other free-form answers can not be recognized by the evaluation script and will be marked as **WRONG**.
9 |
10 | The evaluation script is available at https://github.com/open-compass/VLMEvalKit/tree/main/scripts/mmb_eval_gradio.py
11 | Please contact `opencompass@pjlab.org.cn` for any inquiries about this script.
12 | """
13 |
14 | def upload_file(file):
15 | file_path = file.name
16 | return file_path
17 |
18 | def prepare_file(file_name):
19 | file_md5 = md5(file_name)
20 | root = LMUDataRoot()
21 | root = osp.join(root, 'eval_server')
22 | os.makedirs(root, exist_ok=True)
23 | suffix = file_name.split('.')[-1]
24 | if suffix not in ['xlsx', 'tsv', 'csv']:
25 | return False, "Please submit a file that ends with `.xlsx`, `.tsv`, or `.csv`"
26 | new_file_name = osp.join(root, f'{file_md5}.{suffix}')
27 | shutil.move(file_name, new_file_name)
28 | eval_file = new_file_name
29 | try:
30 | data = load(eval_file)
31 | except:
32 | return False, "Your excel file can not be successfully loaded by `pd.read_excel`, please double check and submit again. "
33 | for k in data.keys():
34 | data[k.lower() if k not in 'ABCD' else k] = data.pop(k)
35 | if "index" not in data:
36 | return False, "Your excel file should have a column named `index`, please double check and submit again"  # caller expects a 2-tuple
37 | if "prediction" not in data:
38 | return False, "Your excel file should have a column named `prediction`, please double check and submit again"
39 | for ch in 'ABCD':
40 | if ch not in data:
41 | return False, f"Your excel file should have a column named `{ch}`, please double check and submit again"
42 | dump(data, eval_file)
43 | return True, eval_file
44 |
45 | def determine_dataset(eval_file):
46 | data = load(eval_file)
47 | def cn_ratio(data):
48 | iscn = [cn_string(x) for x in data['question']]
49 | return np.mean(iscn)
50 | if len(data) < 2500 and 'l2-category' not in data:
51 | return 'CCBench' if cn_ratio(data) > 0.5 else "Unknown"
52 | else:
53 | return 'MMBench_CN' if cn_ratio(data) > 0.5 else "MMBench"
54 |
55 | def reformat_acc(acc):
56 | splits = set(acc['split'])
57 | keys = list(acc.keys())
58 | keys.remove('split')
59 | nacc = {'Category': []}
60 | for sp in splits:
61 | nacc[sp.upper()] = []
62 | for k in keys:
63 | nacc['Category'].append(k)
64 | for sp in splits:
65 | nacc[sp.upper()].append(acc[acc['split'] == sp].iloc[0][k] * 100)
66 | return pd.DataFrame(nacc)
67 |
68 | def evaluate(file):
69 | file_name = file.name
70 | flag, eval_file = prepare_file(file_name)
71 | if not flag:
72 | return "Error: " + eval_file
73 | dataset = determine_dataset(eval_file)
74 | if dataset == 'Unknown':
75 | return "Error: Cannot determine the dataset given your submitted file. "
76 |
77 | eval_id = eval_file.split('/')[-1].split('.')[0]
78 | ret = f"Evaluation ID: {eval_id}\n"
79 | timestamp = datetime.datetime.now().strftime('%Y.%m.%d %H:%M:%S')
80 | ret += f'Evaluation Timestamp: {timestamp}\n'
81 | acc = multiple_choice_eval(eval_file, dataset=dataset, model='exact_matching')
82 | nacc = reformat_acc(acc).round(1)
83 | return ret, nacc
84 |
85 | with gr.Blocks() as demo:
86 | gr.Markdown(HEADER)
87 | file_output = gr.File()
88 | upload_button = gr.UploadButton("Click to upload your prediction files for a supported benchmark")
89 | upload_button.upload(upload_file, upload_button, file_output)
90 |
91 | btn = gr.Button("🚀 Evaluate")
92 | eval_log = gr.Textbox(label="Evaluation Log", placeholder="Your evaluation log will be displayed here")
93 | df_empty = pd.DataFrame([], columns=['Evaluation Result'])
94 | eval_result = gr.components.DataFrame(value=df_empty)
95 | btn.click(evaluate, inputs=[file_output], outputs=[eval_log, eval_result])
96 |
97 | if __name__ == '__main__':
98 | demo.launch(server_name='0.0.0.0', debug=True, show_error=True)
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 | export GPU=$(nvidia-smi --list-gpus | wc -l)
4 | torchrun --nproc-per-node=$GPU run.py ${@:1}
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/srun.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 | srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2}
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/summarize.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 | from vlmeval.utils.dataset_config import dataset_URLs
3 |
4 | def get_score(model, dataset):
5 |
6 | file_name = f'{model}/{model}_{dataset}'
7 | if listinstr([
8 | 'CCBench', 'MMBench', 'SEEDBench_IMG', 'MMMU', 'ScienceQA', 'AI2D_TEST', 'MMStar', 'RealWorldQA'], dataset):
9 | file_name += '_acc.csv'
10 | elif listinstr(['MME', 'Hallusion', 'LLaVABench'], dataset):
11 | file_name += '_score.csv'
12 | elif listinstr(['MMVet', 'MathVista'], dataset):
13 | file_name += '_gpt-4-turbo_score.csv'
14 | elif listinstr(['COCO', 'OCRBench'], dataset):
15 | file_name += '_score.json'
16 | else:
17 | raise NotImplementedError
18 |
19 | if not osp.exists(file_name):
20 | return {}
21 |
22 | data = load(file_name)
23 | ret = {}
24 | if dataset == 'CCBench':
25 | ret[dataset] = data['Overall'][0] * 100
26 | elif dataset == 'MMBench':
27 | for n, a in zip(data['split'], data['Overall']):
28 | if n == 'dev':
29 | ret['MMBench_DEV_EN'] = a * 100
30 | elif n == 'test':
31 | ret['MMBench_TEST_EN'] = a * 100
32 | elif dataset == 'MMBench_CN':
33 | for n, a in zip(data['split'], data['Overall']):
34 | if n == 'dev':
35 | ret['MMBench_DEV_CN'] = a * 100
36 | elif n == 'test':
37 | ret['MMBench_TEST_CN'] = a * 100
38 | elif listinstr(['SEEDBench', 'ScienceQA', 'MMBench', 'AI2D_TEST', 'MMStar', 'RealWorldQA'], dataset):
39 | ret[dataset] = data['Overall'][0] * 100
40 | elif 'MME' == dataset:
41 | ret[dataset] = data['perception'][0] + data['reasoning'][0]
42 | elif 'MMVet' == dataset:
43 | data = data[data['Category'] == 'Overall']
44 | ret[dataset] = float(data.iloc[0]['acc'])
45 | elif 'HallusionBench' == dataset:
46 | data = data[data['split'] == 'Overall']
47 | for met in ['aAcc', 'qAcc', 'fAcc']:
48 | ret[dataset + f' ({met})'] = float(data.iloc[0][met])
49 | elif 'MMMU' in dataset:
50 | data = data[data['split'] == 'validation']
51 | ret['MMMU (val)'] = float(data.iloc[0]['Overall']) * 100
52 | elif 'MathVista' in dataset:
53 | data = data[data['Task&Skill'] == 'Overall']
54 | ret[dataset] = float(data.iloc[0]['acc'])
55 | elif 'LLaVABench' in dataset:
56 | data = data[data['split'] == 'overall'].iloc[0]
57 | ret[dataset] = float(data['Relative Score (main)'])
58 | elif 'OCRBench' in dataset:
59 | ret[dataset] = data['Final Score']
60 |
61 | return ret
62 |
63 | def parse_args():
64 | parser = argparse.ArgumentParser()
65 | parser.add_argument('--data', type=str, nargs='+', default=[])
66 | parser.add_argument("--model", type=str, nargs='+', required=True)
67 | args = parser.parse_args()
68 | return args
69 |
70 | def gen_table(models, datasets):
71 | res = defaultdict(dict)
72 | for m in models:
73 | for d in datasets:
74 | try:
75 | res[m].update(get_score(m, d))
76 | except:
77 | pass
78 | keys = []
79 | for m in models:
80 | for d in res[m]:
81 | keys.append(d)
82 | keys = list(set(keys))
83 | keys.sort()
84 | final = defaultdict(list)
85 | for m in models:
86 | final['Model'].append(m)
87 | for k in keys:
88 | if k in res[m]:
89 | final[k].append(res[m][k])
90 | else:
91 | final[k].append(None)
92 | final = pd.DataFrame(final)
93 | dump(final, 'summ.csv')
94 | if len(final) >= len(final.iloc[0].keys()):
95 | print(tabulate(final))
96 | else:
97 | print(tabulate(final.T))
98 |
99 | if __name__ == '__main__':
100 | args = parse_args()
101 | if args.data == []:
102 | args.data = list(dataset_URLs)
103 | gen_table(args.model, args.data)
--------------------------------------------------------------------------------
/VLMEvalKit/test_Mantis.py:
--------------------------------------------------------------------------------
1 |
2 | import requests
3 | import torch
4 | from PIL import Image
5 | from io import BytesIO
6 |
7 | from transformers import AutoProcessor, AutoModelForVision2Seq
8 | from transformers.image_utils import load_image
9 | import os
10 | import json
11 | import time
12 | import random
13 |
14 |
15 | processor = AutoProcessor.from_pretrained("TIGER-Lab/Mantis-8B-Idefics2") # do_image_splitting is False by default
16 | model = AutoModelForVision2Seq.from_pretrained(
17 | "TIGER-Lab/Mantis-8B-Idefics2",
18 | device_map="auto"
19 | )
20 | generation_kwargs = {
21 | "max_new_tokens": 1024,
22 | "num_beams": 1,
23 | "do_sample": False
24 | }
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 | def call_mantis(image_paths,question):
33 | # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
34 | images = []
35 | try:
36 | for image_path in image_paths:
37 | image = load_image(image_path)
38 | images.append(image)
39 | except Exception as e:
40 | print(e)
41 | return 'image error'
42 |
43 | content_list = []
44 | for i in range(len(images)):
45 | content_list.append({"type": "image"})
46 | content_list.append({"type": "text", "text": question})
47 |
48 |
49 | messages = [
50 | {
51 | "role": "user",
52 | "content": content_list
53 | }
54 | ]
55 |
56 | try:
57 | prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
58 | inputs = processor(text=prompt, images=images, return_tensors="pt")
59 | inputs = {k: v.to(model.device) for k, v in inputs.items()}
60 |
61 | # Generate
62 | generated_ids = model.generate(**inputs, **generation_kwargs)
63 | response = processor.batch_decode(generated_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
64 | print("ASSISTANT: ", response[0])
65 | return response[0]
66 | except Exception as e:
67 | print(e)
68 | print('model error')
69 | return 'model error'
70 |
71 |
72 | json_path = 'all.json'
73 |
74 | tasks_exist = ['person_reid', 'multiple_image_captioning', 'spot_the_similarity', 'face_retrieval', 'sketch2image_retrieval', 'handwritten_retrieval', 'spot_the_diff', 'image2image_retrieval', 'vehicle_retrieval', 'text2image_retrieval',
75 | 'general_action_recognition', 'video_captioning', 'next_img_prediction', 'temporal_ordering', 'meme_vedio_understanding', 'action_quality_assessment', 'temporal_localization', 'mevis',
76 | 'ravens_progressive_matrices', 'threed_indoor_recognition', 'point_tracking', 'threed_cad_recognition', 'single_object_tracking']
77 | with open(json_path,'r') as f:
78 | data_all = json.load(f)
79 |
80 | result = []
81 |
82 |
83 |
84 | for modelname in ['Mantis-8B-Idefics2']:  # note: 'models' was undefined here; this placeholder label only names the output directory and result key
85 | output_dir = os.path.join('./result')
86 |
87 | output_dir = os.path.join(output_dir,modelname)
88 | if not os.path.exists(output_dir):
89 | os.makedirs(output_dir)
90 |
91 | output_path = os.path.join(output_dir,'metadata_info.json')
92 |
93 | for task_data in data_all:
94 |
95 | context = task_data["context"]
96 | question = task_data["question"]
97 |
98 | tmp = []
99 | image_flag = True
100 |
101 | for image_path in task_data["input_image_path"]:
102 |
103 | tmp.append(image_path)
104 | if not os.path.exists(image_path):
105 | image_flag = False
106 | break
107 |
108 | if image_flag == False:
109 | response = 'image none'
110 | task_data[modelname] = response
111 | print(modelname, task_data['task'], len(tmp), ': ', response)  # 'task' was undefined; use the task name from task_data
112 | result.append(task_data)
113 | continue
114 |
115 |
116 |
117 | try:
118 |
119 | if task_data['task'] in tasks_exist:
120 | question = question + '\n' + context
121 | else:
122 | question = context + '\n' + question
123 | question = question + '\nPlease answer the option directly like A,B,C,D...'
124 |
125 | response = call_mantis(tmp,question)
126 | task_data[modelname] = response
127 | print(modelname, task_data['task'], len(tmp), ': ', response)
128 | except:
129 | response = 'model error or image error'
130 | task_data[modelname] = response
131 | print(modelname, task_data['task'], len(tmp), ': ', response)
132 | result.append(task_data)
133 |
134 |
135 |
136 | with open(output_path,'w') as f:
137 | json.dump(result,f)
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
--------------------------------------------------------------------------------
/VLMEvalKit/test_interlvl1.5.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoTokenizer, AutoModel
2 | import torch
3 | import torchvision.transforms as T
4 | from PIL import Image
5 | import random
6 | from torchvision.transforms.functional import InterpolationMode
7 | import os
8 | import json
9 |
10 | IMAGENET_MEAN = (0.485, 0.456, 0.406)
11 | IMAGENET_STD = (0.229, 0.224, 0.225)
12 |
13 |
14 | def build_transform(input_size):
15 | MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
16 | transform = T.Compose([
17 | T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
18 | T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
19 | T.ToTensor(),
20 | T.Normalize(mean=MEAN, std=STD)
21 | ])
22 | return transform
23 |
24 |
25 | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
26 | best_ratio_diff = float('inf')
27 | best_ratio = (1, 1)
28 | area = width * height
29 | for ratio in target_ratios:
30 | target_aspect_ratio = ratio[0] / ratio[1]
31 | ratio_diff = abs(aspect_ratio - target_aspect_ratio)
32 | if ratio_diff < best_ratio_diff:
33 | best_ratio_diff = ratio_diff
34 | best_ratio = ratio
35 | elif ratio_diff == best_ratio_diff:
36 | if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
37 | best_ratio = ratio
38 | return best_ratio
39 |
40 |
41 | def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
42 | orig_width, orig_height = image.size
43 | aspect_ratio = orig_width / orig_height
44 |
45 | # calculate the existing image aspect ratio
46 | target_ratios = set(
47 | (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
48 | i * j <= max_num and i * j >= min_num)
49 | target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
50 |
51 | # find the closest aspect ratio to the target
52 | target_aspect_ratio = find_closest_aspect_ratio(
53 | aspect_ratio, target_ratios, orig_width, orig_height, image_size)
54 |
55 | # calculate the target width and height
56 | target_width = image_size * target_aspect_ratio[0]
57 | target_height = image_size * target_aspect_ratio[1]
58 | blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
59 |
60 | # resize the image
61 | resized_img = image.resize((target_width, target_height))
62 | processed_images = []
63 | for i in range(blocks):
64 | box = (
65 | (i % (target_width // image_size)) * image_size,
66 | (i // (target_width // image_size)) * image_size,
67 | ((i % (target_width // image_size)) + 1) * image_size,
68 | ((i // (target_width // image_size)) + 1) * image_size
69 | )
70 | # split the image
71 | split_img = resized_img.crop(box)
72 | processed_images.append(split_img)
73 | assert len(processed_images) == blocks
74 | if use_thumbnail and len(processed_images) != 1:
75 | thumbnail_img = image.resize((image_size, image_size))
76 | processed_images.append(thumbnail_img)
77 | return processed_images
78 |
79 |
80 | def load_image(image_file, input_size=448, max_num=6):
81 | image = Image.open(image_file).convert('RGB')
82 | transform = build_transform(input_size=input_size)
83 | images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
84 | pixel_values = [transform(image) for image in images]
85 | pixel_values = torch.stack(pixel_values)
86 | return pixel_values
87 |
88 | path = "/mnt/hwfile/gveval/mengfanqing/InternVL-Chat-V1-5"
89 | # If you have an 80G A100 GPU, you can put the entire model on a single GPU.
90 | model = AutoModel.from_pretrained(
91 | path,
92 | torch_dtype=torch.bfloat16,
93 | low_cpu_mem_usage=True,
94 | trust_remote_code=True).eval().cuda()
95 |
96 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
97 |
98 |
99 | generation_config = dict(
100 | num_beams=1,
101 | max_new_tokens=4096,
102 | do_sample=False,
103 | )
104 |
105 |
106 |
107 |
108 | json_path = 'all.json'
109 |
110 | tasks_exist = ['person_reid', 'multiple_image_captioning', 'spot_the_similarity', 'face_retrieval', 'sketch2image_retrieval', 'handwritten_retrieval', 'spot_the_diff', 'image2image_retrieval', 'vehicle_retrieval', 'text2image_retrieval',
111 | 'general_action_recognition', 'video_captioning', 'next_img_prediction', 'temporal_ordering', 'meme_vedio_understanding', 'action_quality_assessment', 'temporal_localization', 'mevis',
112 | 'ravens_progressive_matrices', 'threed_indoor_recognition', 'point_tracking', 'threed_cad_recognition', 'single_object_tracking']
113 | with open(json_path,'r') as f:
114 | data_all = json.load(f)
115 |
116 | result = []
117 |
118 |
119 |
120 | for modelname in ['InternVL-Chat-V1-5']:  # note: 'models' was undefined here; this placeholder label only names the output directory and result key
121 | output_dir = os.path.join('./result')
122 |
123 | output_dir = os.path.join(output_dir,modelname)
124 | if not os.path.exists(output_dir):
125 | os.makedirs(output_dir)
126 |
127 | output_path = os.path.join(output_dir,'metadata_info.json')
128 |
129 | for task_data in data_all:
130 |
131 | context = task_data["context"]
132 | question = task_data["question"]
133 |
134 | tmp = []
135 | image_flag = True
136 |
137 | for image_path in task_data["input_image_path"]:
138 |
139 | tmp.append(image_path)
140 | if not os.path.exists(image_path):
141 | image_flag = False
142 | break
143 |
144 | if image_flag == False:
145 | response = 'image none'
146 | task_data[modelname] = response
147 | print(modelname, task_data['task'], len(tmp), ': ', response)  # 'task' was undefined; use the task name from task_data
148 | result.append(task_data)
149 | continue
150 |
151 |
152 |
153 | try:
154 |
155 | if task_data['task'] in tasks_exist:
156 | question = question + '\n' + context
157 | else:
158 | question = context + '\n' + question
159 | question = question + '\nPlease answer the option directly like A,B,C,D...'
160 | pixel_values = torch.cat([load_image(p).to(torch.bfloat16).cuda() for p in tmp], dim=0)  # 'tmp' holds image paths, so load each one before concatenating
161 | response, _ = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
162 | task_data[modelname] = response
163 | print(modelname, task_data['task'], len(tmp), ': ', response)
164 | except:
165 | response = 'model error or image error'
166 | task_data[modelname] = response
167 | print(modelname, task_data['task'], len(tmp), ': ', response)
168 | result.append(task_data)
169 |
170 |
171 |
172 | with open(output_path,'w') as f:
173 | json.dump(result,f)
174 |
175 |
--------------------------------------------------------------------------------
/VLMEvalKit/test_internvl2-pro.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 |
4 | url = "xx" # replace with the actual API endpoint (URL)
5 | api_key = "InternVL-2-Pro_da046f58b9adc971c2a9f002d8ad4e5704cadf76161268db240bf3afea8b9d78_gI8iJTcO" # replace with your generated API key
6 |
7 |
8 | # high-level obj
9 | context = "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open the Firefox Browser to search for the best video blogs on travel vlogs. Then, go to the Setting app to turn up the brightness on your phone.\nThe historical actions are: step 1: CLICK: (603, 801)\nstep 2: CLICK: (190, 427)\nstep 3: CLICK: (834, 565)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: LEFT\nB: SCROLL: UP\nC: SCROLL: DOWN\nD: CLICK: (31, 960)\n"
10 | question = "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (603, 801)\nstep 2: CLICK: (190, 427)\nstep 3: CLICK: (834, 565)\nI want to Open the Firefox Browser to search for the best video blogs on travel vlogs. Then, go to the Setting app to turn up the brightness on your phone. Finally, open the YouTube app to follow the video blogs you found. Which action should I do next?"
11 | question = context + '\n' + question
12 | question = question + '\nPlease answer the option directly like A,B,C,D...'
13 |
14 | file_paths = [
15 | "3873605806112821_0.png",
16 | "3873605806112821_1.png",
17 | "3873605806112821_2.png",
18 | "3873605806112821_3.png"
19 | ]
20 |
21 |
22 |
23 |
24 | files = [('files', open(file_path, 'rb')) for file_path in file_paths]
25 | data = {
26 | 'question': question,
27 | 'api_key': api_key
28 | }
29 |
30 | while True:
31 | try:
32 | response = requests.post(url, files=files, data=data)
33 | if response.status_code == 200:
34 | print("Response:", response.json().get("response", "No response key found in the JSON."))
35 | break # Exit the loop if the request is successful
36 | else:
37 | print("Error:", response.status_code, response.text)
38 | except requests.exceptions.RequestException as e:
39 | print(f"Request failed: {e}")
40 |
41 | # Wait for a while before retrying
42 | time.sleep(2)
43 |
44 |
45 | print('------------------------------')
46 |
--------------------------------------------------------------------------------
/VLMEvalKit/test_models.py:
--------------------------------------------------------------------------------
1 |
2 | from vlmeval.config import supported_VLM
3 | import os
4 | import json
5 |
6 | # transformers == 4.33.0
7 | # ['XComposer2','XComposer2_1.8b','qwen_base','idefics_9b_instruct','qwen_chat', 'flamingov2']
8 |
9 |
10 | # transformers == 4.37.0
11 | # ['deepseek_vl_1.3b','deepseek_vl_7b']
12 |
13 | # transformers == 4.40.0
14 | # ['idefics2_8b']
15 |
16 |
17 | models = ['XComposer2','XComposer2_1.8b','qwen_base','idefics_9b_instruct','qwen_chat', 'flamingov2']
18 | json_path = 'all.json'
19 |
20 | tasks_exist = ['person_reid', 'multiple_image_captioning', 'spot_the_similarity', 'face_retrieval', 'sketch2image_retrieval', 'handwritten_retrieval', 'spot_the_diff', 'image2image_retrieval', 'vehicle_retrieval', 'text2image_retrieval',
21 | 'general_action_recognition', 'video_captioning', 'next_img_prediction', 'temporal_ordering', 'meme_vedio_understanding', 'action_quality_assessment', 'temporal_localization', 'mevis',
22 | 'ravens_progressive_matrices', 'threed_indoor_recognition', 'point_tracking', 'threed_cad_recognition', 'single_object_tracking']
23 | with open(json_path,'r') as f:
24 | data_all = json.load(f)
25 |
26 | result = []
27 |
28 |
29 |
30 | for modelname in models:
31 | model = supported_VLM[modelname]()
32 | output_dir = os.path.join('./result')
33 |
34 | output_dir = os.path.join(output_dir,modelname)
35 | if not os.path.exists(output_dir):
36 | os.makedirs(output_dir)
37 |
38 | output_path = os.path.join(output_dir,'metadata_info.json')
39 |
40 | for task_data in data_all:
41 |
42 | context = task_data["context"]
43 | question = task_data["question"]
44 |
45 | tmp = []
46 | image_flag = True
47 |
48 | for image_path in task_data["input_image_path"]:
49 |
50 | tmp.append(image_path)
51 | if not os.path.exists(image_path):
52 | image_flag = False
53 | break
54 |
55 | if image_flag == False:
56 | response = 'image none'
57 | task_data[modelname] = response
58 | print(modelname, task_data['task'], len(tmp), ': ', response)  # 'task' was undefined; use the task name from task_data
59 | result.append(task_data)
60 | continue
61 |
62 |
63 |
64 | try:
65 |
66 | if task_data['task'] in tasks_exist:
67 | question = question + '\n' + context
68 | else:
69 | question = context + '\n' + question
70 | question = question + '\nPlease answer the option directly like A,B,C,D...'
71 | tmp.append(question)
72 | response = model.generate(tmp)
73 | task_data[modelname] = response
74 | print(modelname, task_data['task'], len(tmp), ': ', response)
75 | except:
76 | response = 'model error or image error'
77 | task_data[modelname] = response
78 | print(modelname, task_data['task'], len(tmp), ': ', response)
79 | result.append(task_data)
80 |
81 |
82 |
83 | with open(output_path,'w') as f:
84 | json.dump(result,f)
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
--------------------------------------------------------------------------------
/VLMEvalKit/utils/__pycache__/conversation.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/utils/__pycache__/conversation.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/utils/tools.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/utils/tools.py
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 | import torch
3 | except ImportError:
4 | pass
5 |
6 | from .smp import *
7 | from .api import *
8 | from .evaluate import *
9 | from .utils import *
10 | from .vlm import *
11 | from .config import *
12 | from .tools import cli
13 |
14 | load_env()
15 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/__pycache__/config.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/__pycache__/config.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/__pycache__/tools.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/__pycache__/tools.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__init__.py:
--------------------------------------------------------------------------------
1 | from .gpt import OpenAIWrapper, GPT4V
2 | from .gpt_int import OpenAIWrapperInternal, GPT4V_Internal
3 | from .hf_chat_model import HFChatModel
4 | from .gemini import GeminiWrapper, GeminiProVision
5 | from .qwen_vl_api import QwenVLWrapper, QwenVLAPI
6 | from .qwen_api import QwenAPI
7 | from .stepai import Step1V_INT
8 | from .claude import Claude_Wrapper, Claude3V
9 | from .reka import Reka
10 | from .glm_vision import GLMVisionAPI
11 |
12 | __all__ = [
13 | 'OpenAIWrapper', 'HFChatModel', 'OpenAIWrapperInternal', 'GeminiWrapper',
14 | 'GPT4V', 'GPT4V_Internal', 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI',
15 | 'QwenAPI', 'Claude3V', 'Claude_Wrapper', 'Reka', 'Step1V_INT', 'GLMVisionAPI'
16 | ]
17 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/base.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/base.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/claude.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/claude.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/gemini.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/gemini.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/glm_vision.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/glm_vision.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/gpt.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/gpt.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/gpt_int.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/gpt_int.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/hf_chat_model.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/hf_chat_model.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/qwen_api.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/qwen_api.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/qwen_vl_api.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/qwen_vl_api.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/reka.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/reka.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__pycache__/stepai.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/stepai.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/claude.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 | from vlmeval.api.base import BaseAPI
3 | from time import sleep
4 | import base64
5 | import mimetypes
6 |
7 | url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat'
8 | headers = {
9 | 'alles-apin-token': '',
10 | 'Content-Type': 'application/json'
11 | }
12 |
13 |
14 | class Claude_Wrapper(BaseAPI):
15 |
16 | is_api: bool = True
17 |
18 | def __init__(self,
19 | model: str = 'claude-3-opus-20240229',
20 | key: str = None,
21 | retry: int = 10,
22 | wait: int = 3,
23 | system_prompt: str = None,
24 | verbose: bool = True,
25 | temperature: float = 0,
26 | max_tokens: int = 1024,
27 | **kwargs):
28 |
29 | self.model = model
30 | self.headers = headers
31 | self.temperature = temperature
32 | self.max_tokens = max_tokens
33 | if key is not None:
34 | self.key = key
35 | else:
36 | self.key = os.environ.get('ALLES', '')
37 | self.headers['alles-apin-token'] = self.key
38 |
39 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
40 |
41 | def build_msgs(self, msgs_raw):
42 |
43 | messages = []
44 | message = {'role': 'user', 'content': []}
45 | for msg in msgs_raw:
46 | if msg['type'] == 'image':
47 | pth = msg['value']
48 | suffix = osp.splitext(pth)[-1].lower()
49 | media_type = mimetypes.types_map.get(suffix, None)
50 | assert media_type is not None
51 |
52 | item = {
53 | 'type': 'image',
54 | 'source': {'type': 'base64', 'media_type': media_type, 'data': encode_image_file_to_base64(pth)}
55 | }
56 |
57 | elif msg['type'] == 'text':
58 | item = {'type': 'text', 'text': msg['value']}
59 | else:
60 | raise NotImplementedError(f'Unsupported message type: {msg["type"]}')
61 |
62 | message['content'].append(item)
63 | messages.append(message)
64 | return messages
65 |
66 | def generate_inner(self, inputs, **kwargs) -> str:
67 |
68 | payload = json.dumps({
69 | 'model': self.model,
70 | 'max_tokens': self.max_tokens,
71 | 'messages': self.build_msgs(msgs_raw=inputs),
72 | **kwargs
73 | })
74 | response = requests.request('POST', url, headers=headers, data=payload)
75 |
76 | ret_code = response.status_code
77 | retry = self.retry
78 | while ret_code == 429 and retry > 0:
79 | sleep(15)
80 | response = requests.request('POST', url, headers=headers, data=payload)
81 | ret_code = response.status_code
82 | retry -= 1
83 |
84 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
85 | answer = self.fail_msg
86 |
87 | try:
88 | resp_struct = json.loads(response.text)
89 | answer = resp_struct['data']['content'][0]['text'].strip()
90 | except:
91 | pass
92 |
93 | return ret_code, answer, response
94 |
95 |
96 | class Claude3V(Claude_Wrapper):
97 |
98 | def generate(self, message, dataset=None):
99 | return super(Claude_Wrapper, self).generate(message)
100 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/gemini.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 | from vlmeval.api.base import BaseAPI
3 |
4 | headers = 'Content-Type: application/json'
5 |
6 |
7 | class GeminiWrapper(BaseAPI):
8 |
9 | is_api: bool = True
10 |
11 | def __init__(self,
12 | model: str = 'gemini-1.0-pro',
13 | retry: int = 5,
14 | wait: int = 5,
15 | key: str = None,
16 | verbose: bool = True,
17 | temperature: float = 0.0,
18 | system_prompt: str = None,
19 | max_tokens: int = 1024,
20 | proxy: str = None,
21 | backend='genai',
22 | project_id='vlmeval',
23 | **kwargs):
24 |
25 | assert model in ['gemini-1.0-pro', 'gemini-1.5-pro']
26 |
27 | self.model = model
28 | self.fail_msg = 'Failed to obtain answer via API. '
29 | self.max_tokens = max_tokens
30 | self.temperature = temperature
31 | if key is None:
32 | key = os.environ.get('GOOGLE_API_KEY', None)
33 | # Try to load backend from environment variable
34 | be = os.environ.get('GOOGLE_API_BACKEND', None)
35 | if be is not None and be in ['genai', 'vertex']:
36 | backend = be
37 |
38 | assert backend in ['genai', 'vertex']
39 | if backend == 'genai':
40 | assert model == 'gemini-1.0-pro'
41 |
42 | self.backend = backend
43 | self.project_id = project_id
44 |
45 | assert key is not None
46 | self.api_key = key
47 | if proxy is not None:
48 | proxy_set(proxy)
49 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
50 |
51 | def build_msgs_genai(self, inputs):
52 | messages = [] if self.system_prompt is None else [self.system_prompt]
53 | for inp in inputs:
54 | if inp['type'] == 'text':
55 | messages.append(inp['value'])
56 | elif inp['type'] == 'image':
57 | messages.append(Image.open(inp['value']))
58 | return messages
59 |
60 | def build_msgs_vertex(self, inputs):
61 | from vertexai.generative_models import Part, Image
62 | messages = [] if self.system_prompt is None else [self.system_prompt]
63 | for inp in inputs:
64 | if inp['type'] == 'text':
65 | messages.append(inp['value'])
66 | elif inp['type'] == 'image':
67 | messages.append(Part.from_image(Image.load_from_file(inp['value'])))
68 | return messages
69 |
70 | def generate_inner(self, inputs, **kwargs) -> str:
71 | if self.backend == 'genai':
72 | import google.generativeai as genai
73 | assert isinstance(inputs, list)
74 | pure_text = np.all([x['type'] == 'text' for x in inputs])
75 | genai.configure(api_key=self.api_key)
76 | model = genai.GenerativeModel('gemini-pro') if pure_text else genai.GenerativeModel('gemini-pro-vision')
77 | messages = self.build_msgs_genai(inputs)
78 | gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature)
79 | gen_config.update(kwargs)
80 | try:
81 | answer = model.generate_content(
82 | messages,
83 | generation_config=genai.types.GenerationConfig(**gen_config)).text
84 | return 0, answer, 'Succeeded! '
85 | except Exception as err:
86 | if self.verbose:
87 | self.logger.error(err)
88 | self.logger.error(f'The input messages are {inputs}.')
89 |
90 | return -1, '', ''
91 | elif self.backend == 'vertex':
92 | import vertexai
93 | from vertexai.generative_models import GenerativeModel
94 | vertexai.init(project=self.project_id, location='us-central1')
95 | model_name = 'gemini-1.0-pro-vision' if self.model == 'gemini-1.0-pro' else 'gemini-1.5-pro'
96 | model = GenerativeModel(model_name=model_name)
97 | messages = self.build_msgs_vertex(inputs)
98 | try:
99 | resp = model.generate_content(messages)
100 | answer = resp.text
101 | return 0, answer, 'Succeeded! '
102 | except Exception as err:
103 | if self.verbose:
104 | self.logger.error(err)
105 | self.logger.error(f'The input messages are {inputs}.')
106 |
107 | return -1, '', ''
108 |
109 |
110 | class GeminiProVision(GeminiWrapper):
111 |
112 | def generate(self, message, dataset=None):
113 | return super(GeminiProVision, self).generate(message)
114 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/glm_vision.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 | from vlmeval.api.base import BaseAPI
3 | from vlmeval.utils.dataset import DATASET_TYPE
4 | from vlmeval.smp.vlm import encode_image_file_to_base64
5 |
6 |
7 | class GLMVisionWrapper(BaseAPI):
8 |
9 | is_api: bool = True
10 |
11 | def __init__(self,
12 | model: str,
13 | retry: int = 5,
14 | wait: int = 5,
15 | key: str = None,
16 | verbose: bool = True,
17 | system_prompt: str = None,
18 | max_tokens: int = 1024,
19 | proxy: str = None,
20 | **kwargs):
21 |
22 | self.model = model
23 | self.fail_msg = 'Failed to obtain answer via API. '
24 | self.default_params = {
25 | 'top_p': 0.6,
26 | 'top_k': 2,
27 | 'temperature': 0.8,
28 | 'repetition_penalty': 1.1,
29 | 'best_of': 1,
30 | 'do_sample': True,
31 | 'stream': False,
32 | 'max_tokens': max_tokens
33 | }
34 | if key is None:
35 | key = os.environ.get('GLMV_API_KEY', None)
36 | assert key is not None, (
37 | 'Please set the API Key (obtain it here: '
38 | 'https://open.bigmodel.cn/dev/howuse/introduction)'
39 | )
40 | self.key = key
41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
42 |
43 | def image_to_base64(self, image_path):
44 | import base64
45 | with open(image_path, 'rb') as image_file:
46 | encoded_string = base64.b64encode(image_file.read())
47 | return encoded_string.decode('utf-8')
48 |
49 | def build_msgs(self, msgs_raw, system_prompt=None, dataset=None):
50 | msgs = cp.deepcopy(msgs_raw)
51 | content = []
52 | text = ''
53 | for i, msg in enumerate(msgs):
54 | if msg['type'] == 'text':
55 | text += msg['value']
56 | elif msg['type'] == 'image':
57 | content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value']))))
58 | if dataset is not None and DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']:
59 | text += '\nShort Answer.'
60 | content.append(dict(type='text', text=text))
61 | ret = [dict(role='user', content=content)]
62 | return ret
63 |
64 | def generate_inner(self, inputs, **kwargs) -> str:
65 | assert isinstance(inputs, str) or isinstance(inputs, list)
66 | inputs = [inputs] if isinstance(inputs, str) else inputs
67 |
68 | messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None))
69 |
70 | url = 'https://api.chatglm.cn/v1/chat/completions'
71 | headers = {
72 | 'Content-Type': 'application/json',
73 | 'Request-Id': 'remote-test',
74 | 'Authorization': f'Bearer {self.key}'
75 | }
76 | payload = {
77 | 'model': self.model,
78 | 'messages': messages,
79 | **self.default_params
80 | }
81 | response = requests.post(url, headers=headers, data=json.dumps(payload), verify=False)
82 | output = []
83 | try:
84 | assert response.status_code == 200
85 | for line in response.iter_lines():
86 | data = json.loads(line.decode('utf-8').lstrip('data: '))
87 | output.append(data['choices'][0]['message']['content'])
88 | answer = ''.join(output).replace('</s>', '')
89 | if self.verbose:
90 | self.logger.info(f'inputs: {inputs}\nanswer: {answer}')
91 | return 0, answer, 'Succeeded! '
92 | except Exception as err:
93 | if self.verbose:
94 | self.logger.error(err)
95 | self.logger.error(f'The input messages are {inputs}.')
96 | return -1, self.fail_msg, ''
97 |
98 |
99 | class GLMVisionAPI(GLMVisionWrapper):
100 |
101 | def generate(self, message, dataset=None):
102 | return super(GLMVisionAPI, self).generate(message, dataset=dataset)
103 |
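A minimal usage sketch for the wrapper above, assuming a valid GLMV_API_KEY and a readable local image; the model identifier and file paths are placeholders, not verified values. Messages follow the same {'type': ..., 'value': ...} convention consumed by build_msgs:

import os
from vlmeval.api.glm_vision import GLMVisionAPI

os.environ.setdefault('GLMV_API_KEY', '<your key>')    # or pass key=... to the constructor
api = GLMVisionAPI(model='<your-glm-vision-model>')    # hypothetical model name
msgs = [
    dict(type='image', value='/path/to/demo.jpg'),
    dict(type='text', value='Describe this image.'),
]
answer = api.generate(msgs, dataset='MMBench_DEV_EN')  # multi-choice / Y-N datasets get '\nShort Answer.' appended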
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/gpt_int.py:
--------------------------------------------------------------------------------
1 | import json
2 | import warnings
3 | import requests
4 | from ..smp import *
5 | from .gpt import GPT_context_window, OpenAIWrapper
6 |
7 | url = 'http://ecs.sv.us.alles-apin.openxlab.org.cn/v1/openai/v2/text/chat'
8 | headers = {
9 | 'Content-Type': 'application/json'
10 | }
11 |
12 |
13 | class OpenAIWrapperInternal(OpenAIWrapper):
14 |
15 | is_api: bool = True
16 |
17 | def __init__(self,
18 | model: str = 'gpt-3.5-turbo-0613',
19 | retry: int = 5,
20 | wait: int = 3,
21 | verbose: bool = True,
22 | system_prompt: str = None,
23 | temperature: float = 0,
24 | timeout: int = 60,
25 | max_tokens: int = 1024,
26 | img_size: int = 512,
27 | img_detail: str = 'low',
28 | **kwargs):
29 |
30 | self.model = model
31 | if 'KEYS' in os.environ and osp.exists(os.environ['KEYS']):
32 | keys = load(os.environ['KEYS'])
33 | headers['alles-apin-token'] = keys.get('alles-apin-token', '')
34 | elif 'ALLES' in os.environ:
35 | headers['alles-apin-token'] = os.environ['ALLES']
36 | self.headers = headers
37 | self.temperature = temperature
38 | self.timeout = timeout
39 | self.max_tokens = max_tokens
40 |
41 | assert img_size > 0 or img_size == -1
42 | self.img_size = img_size
43 | assert img_detail in ['high', 'low']
44 | self.img_detail = img_detail
45 |
46 | super(OpenAIWrapper, self).__init__(
47 | wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
48 |
49 | def generate_inner(self, inputs, **kwargs) -> str:
50 | input_msgs = self.prepare_inputs(inputs)
51 |
52 | temperature = kwargs.pop('temperature', self.temperature)
53 | max_tokens = kwargs.pop('max_tokens', self.max_tokens)
54 |
55 | # Held out 100 tokens as buffer
56 | context_window = GPT_context_window(self.model)
57 | max_tokens = min(max_tokens, context_window - self.get_token_len(inputs))
58 | if 0 < max_tokens <= 100:
59 | print('Less than 100 tokens left, may exceed the context window with some additional meta symbols. ')
60 | if max_tokens <= 0:
61 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. '
62 |
63 | payload = dict(
64 | model=self.model,
65 | messages=input_msgs,
66 | max_tokens=max_tokens,
67 | n=1,
68 | stop=None,
69 | timeout=self.timeout,
70 | temperature=temperature,
71 | **kwargs)
72 |
73 | response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
74 | ret_code = response.status_code
75 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
76 |
77 | answer = self.fail_msg
78 | try:
79 | resp_struct = json.loads(response.text)
80 | assert resp_struct['msg'] == 'ok' and resp_struct['msgCode'] == '10000', resp_struct
81 | answer = resp_struct['data']['choices'][0]['message']['content'].strip()
82 | except:
83 | pass
84 | return ret_code, answer, response
85 |
86 |
87 | class GPT4V_Internal(OpenAIWrapperInternal):
88 |
89 | def generate(self, message, dataset=None):
90 | return super(GPT4V_Internal, self).generate(message)
91 |
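The token budgeting in generate_inner above caps the completion length by whatever remains of the model's context window after the prompt. A short worked example of that arithmetic (the window size is an assumed example value, in practice supplied by GPT_context_window):

context_window = 4096                                    # hypothetical value from GPT_context_window(model)
requested, prompt_tokens = 1024, 4050
budget = min(requested, context_window - prompt_tokens)  # -> 46
# 0 < budget <= 100 only triggers the warning above; budget <= 0 returns early with 'Length Exceeded. '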
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/qwen_api.py:
--------------------------------------------------------------------------------
1 | from http import HTTPStatus
2 | import os
3 | from vlmeval.api.base import BaseAPI
4 | from vlmeval.smp import *
5 |
6 |
7 | # Note: This is a pure language model API.
8 | class QwenAPI(BaseAPI):
9 |
10 | is_api: bool = True
11 |
12 | def __init__(self,
13 | model: str = 'qwen-max-1201',
14 | retry: int = 5,
15 | wait: int = 5,
16 | verbose: bool = True,
17 | seed: int = 2680,
18 | temperature: float = 0.0,
19 | system_prompt: str = None,
20 | key: str = None,
21 | max_tokens: int = 1024,
22 | proxy: str = None,
23 | **kwargs):
24 |
25 | assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext']
26 | self.model = model
27 | import dashscope
28 | self.fail_msg = 'Failed to obtain answer via API. '
29 | self.max_tokens = max_tokens
30 | self.temperature = temperature
31 | self.seed = seed
32 | if key is None:
33 | key = os.environ.get('DASHSCOPE_API_KEY', None)
34 | assert key is not None, (
35 | 'Please set the API Key (obtain it here: '
36 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
37 | )
38 | dashscope.api_key = key
39 | if proxy is not None:
40 | proxy_set(proxy)
41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
42 |
43 | @staticmethod
44 | def build_msgs(msgs_raw, system_prompt=None):
45 | msgs = cp.deepcopy(msgs_raw)
46 | ret = []
47 | if system_prompt is not None:
48 | ret.append(dict(role='system', content=system_prompt))
49 | for i, msg in enumerate(msgs):
50 | role = 'user' if i % 2 == 0 else 'assistant'
51 | ret.append(dict(role=role, content=msg))
52 | return ret
53 |
54 | def generate_inner(self, inputs, **kwargs) -> str:
55 | from dashscope import MultiModalConversation
56 | assert isinstance(inputs, str) or isinstance(inputs, list)
57 | inputs = [inputs] if isinstance(inputs, str) else inputs
58 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt)
59 |
60 | import dashscope
61 | response = dashscope.Generation.call(
62 | model=self.model,
63 | messages=messages,
64 | seed=self.seed,
65 | temperature=self.temperature,
66 | max_tokens=self.max_tokens,
67 | result_format='message', # set the result to be "message" format.
68 | )
69 | if response.status_code != HTTPStatus.OK:
70 | return -1, 'Error: Bad Response Status Code. ', f'The response status code is {response.status_code}. '

71 |
72 | try:
73 | return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! '
74 | except Exception as err:
75 | return -1, f'Error: Failed to parse the response. {err}', response
76 |
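Because this endpoint is text-only, build_msgs assigns chat roles purely by position: even-indexed strings become user turns, odd-indexed strings assistant turns, with an optional leading system message. For example:

from vlmeval.api.qwen_api import QwenAPI

msgs = QwenAPI.build_msgs(['What is 2 + 2?', '4', 'And 3 + 3?'], system_prompt='Be concise.')
# -> [{'role': 'system', 'content': 'Be concise.'},
#     {'role': 'user', 'content': 'What is 2 + 2?'},
#     {'role': 'assistant', 'content': '4'},
#     {'role': 'user', 'content': 'And 3 + 3?'}]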
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/qwen_vl_api.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 | from vlmeval.api.base import BaseAPI
3 |
4 |
5 | class QwenVLWrapper(BaseAPI):
6 |
7 | is_api: bool = True
8 |
9 | def __init__(self,
10 | model: str = 'qwen-vl-plus',
11 | retry: int = 5,
12 | wait: int = 5,
13 | key: str = None,
14 | verbose: bool = True,
15 | temperature: float = 0.0,
16 | system_prompt: str = None,
17 | max_tokens: int = 1024,
18 | proxy: str = None,
19 | **kwargs):
20 |
21 | assert model in ['qwen-vl-plus', 'qwen-vl-max']
22 | self.model = model
23 | import dashscope
24 | self.fail_msg = 'Failed to obtain answer via API. '
25 | self.max_tokens = max_tokens
26 | self.temperature = temperature
27 | if key is None:
28 | key = os.environ.get('DASHSCOPE_API_KEY', None)
29 | assert key is not None, (
30 | 'Please set the API Key (obtain it here: '
31 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
32 | )
33 | dashscope.api_key = key
34 | if proxy is not None:
35 | proxy_set(proxy)
36 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
37 |
38 | @staticmethod
39 | def build_msgs(msgs_raw, system_prompt=None):
40 | msgs = cp.deepcopy(msgs_raw)
41 | ret = []
42 | if system_prompt is not None:
43 | content = [dict(text=system_prompt)]
44 | ret.append(dict(role='system', content=content))
45 | content = []
46 | for msg in msgs:
47 | if msg['type'] == 'text':
48 | content.append(dict(text=msg['value']))
49 | elif msg['type'] == 'image':
50 | content.append(dict(image='file://' + msg['value']))
51 | ret.append(dict(role='user', content=content))
52 | return ret
53 |
54 | def generate_inner(self, inputs, **kwargs) -> str:
55 | from dashscope import MultiModalConversation
56 | assert isinstance(inputs, str) or isinstance(inputs, list)
57 | pure_text = np.all([x['type'] == 'text' for x in inputs])
58 | assert not pure_text
59 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt)
60 | gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature)
61 | gen_config.update(kwargs)
62 | try:
63 | response = MultiModalConversation.call(model=self.model, messages=messages)
64 | if self.verbose:
65 | print(response)
66 | answer = response.output.choices[0]['message']['content'][0]['text']
67 | return 0, answer, 'Succeeded! '
68 | except Exception as err:
69 | if self.verbose:
70 | self.logger.error(err)
71 | self.logger.error(f'The input messages are {inputs}.')
72 |
73 | return -1, '', ''
74 |
75 |
76 | class QwenVLAPI(QwenVLWrapper):
77 |
78 | def generate(self, message, dataset=None):
79 | return super(QwenVLAPI, self).generate(message)
80 |
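A minimal usage sketch, assuming DASHSCOPE_API_KEY is set and the image exists locally (paths are placeholders); build_msgs prefixes local paths with 'file://' before handing them to DashScope:

from vlmeval.api.qwen_vl_api import QwenVLAPI

api = QwenVLAPI(model='qwen-vl-plus')
msgs = [
    dict(type='image', value='/path/to/demo.jpg'),        # becomes 'file:///path/to/demo.jpg'
    dict(type='text', value='What is shown in this image?'),
]
answer = api.generate(msgs)                                # BaseAPI.generate (not shown here) returns the answer string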
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/reka.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 | from vlmeval.api.base import BaseAPI
3 | from time import sleep
4 | import mimetypes
5 |
6 |
7 | class Reka_Wrapper(BaseAPI):
8 |
9 | is_api: bool = True
10 | INTERLEAVE: bool = False
11 |
12 | def __init__(self,
13 | model: str = 'reka-flash-20240226',
14 | key: str = None,
15 | retry: int = 10,
16 | wait: int = 3,
17 | system_prompt: str = None,
18 | verbose: bool = True,
19 | temperature: float = 0,
20 | max_tokens: int = 1024,
21 | **kwargs):
22 |
23 | try:
24 | import reka
25 | except ImportError:
26 | raise ImportError('Please install reka by running "pip install reka-api"')
27 |
28 | self.model = model
29 | default_kwargs = dict(temperature=temperature, request_output_len=max_tokens)
30 | default_kwargs.update(kwargs)
31 | self.kwargs = default_kwargs
32 | if key is not None:
33 | self.key = key
34 | else:
35 | self.key = os.environ.get('REKA_API_KEY', '')
36 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
37 |
38 | def generate_inner(self, inputs, **kwargs) -> str:
39 | import reka
40 | reka.API_KEY = self.key
41 | prompt, image_path = self.message_to_promptimg(inputs)
42 | image_b64 = encode_image_file_to_base64(image_path)
43 |
44 | response = reka.chat(
45 | model_name=self.model,
46 | human=prompt,
47 | media_url=f'data:image/jpeg;base64,{image_b64}',
48 | **self.kwargs)
49 |
50 | try:
51 | return 0, response['text'], response
52 | except:
53 | return -1, self.fail_msg, response
54 |
55 |
56 | class Reka(Reka_Wrapper):
57 |
58 | def generate(self, message, dataset=None):
59 | return super(Reka_Wrapper, self).generate(message)
60 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/stepai.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 | from vlmeval.api.base import BaseAPI
3 |
4 | url = 'https://api.stepfun.com/v1/chat/completions'
5 | headers = {
6 | 'Content-Type': 'application/json',
7 | 'Authorization': 'Bearer {}',
8 | }
9 |
10 |
11 | class StepAPI_INT(BaseAPI):
12 |
13 | is_api: bool = True
14 |
15 | def __init__(self,
16 | model: str = 'step-1v-8k',
17 | retry: int = 10,
18 | wait: int = 3,
19 | key: str = None,
20 | temperature: float = 0,
21 | max_tokens: int = 300,
22 | verbose: bool = True,
23 | system_prompt: str = None,
24 | **kwargs):
25 | self.model = model
26 | self.fail_msg = 'Failed to obtain answer via API. '
27 | self.headers = headers
28 | self.temperature = temperature
29 | self.max_tokens = max_tokens
30 | self.system_prompt = system_prompt
31 | if key is not None:
32 | self.key = key
33 | else:
34 | self.key = os.environ.get('STEPAI_API_KEY', '')
35 | headers['Authorization'] = headers['Authorization'].format(self.key)
36 |
37 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
38 |
39 | @staticmethod
40 | def build_msgs(msgs_raw):
41 | messages = []
42 | message = {'role': 'user', 'content': []}
43 |
44 | for msg in msgs_raw:
45 | if msg['type'] == 'image':
46 | image_b64 = encode_image_file_to_base64(msg['value'])
47 | message['content'].append({
48 | 'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)},
49 | 'type': 'image_url'
50 | })
51 | elif msg['type'] == 'text':
52 | message['content'].append({
53 | 'text': msg['value'],
54 | 'type': 'text'
55 | })
56 |
57 | messages.append(message)
58 | return messages
59 |
60 | def generate_inner(self, inputs, **kwargs) -> str:
61 | print(inputs, '\n')
62 | payload = dict(
63 | model=self.model,
64 | max_tokens=self.max_tokens,
65 | temperature=self.temperature,
66 | messages=self.build_msgs(msgs_raw=inputs),
67 | **kwargs)
68 | response = requests.post(url, headers=headers, data=json.dumps(payload))
69 | ret_code = response.status_code
70 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
71 |
72 | answer = self.fail_msg
73 | try:
74 | resp_struct = json.loads(response.text)
75 | answer = resp_struct['choices'][0]['message']['content'].strip()
76 | except:
77 | pass
78 | return ret_code, answer, response
79 |
80 |
81 | class Step1V_INT(StepAPI_INT):
82 |
83 | def generate(self, message, dataset=None):
84 | return super(StepAPI_INT, self).generate(message)
85 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/OCRBench.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 |
3 |
4 | def OCRBench_eval(eval_file):
5 | OCRBench_score = {
6 | 'Regular Text Recognition': 0,
7 | 'Irregular Text Recognition': 0,
8 | 'Artistic Text Recognition': 0,
9 | 'Handwriting Recognition': 0,
10 | 'Digit String Recognition': 0,
11 | 'Non-Semantic Text Recognition': 0,
12 | 'Scene Text-centric VQA': 0,
13 | 'Doc-oriented VQA': 0,
14 | 'Key Information Extraction': 0,
15 | 'Handwritten Mathematical Expression Recognition': 0
16 | }
17 |
18 | logger = get_logger('Evaluation')
19 |
20 | data = load(eval_file)
21 | lt = len(data)
22 | lines = [data.iloc[i] for i in range(lt)]
23 | for i in tqdm(range(len(lines))):
24 | line = lines[i]
25 | predict = str(line['prediction'])
26 | answers = eval(line['answer'])
27 | category = line['category']
28 | if category == 'Handwritten Mathematical Expression Recognition':
29 | for j in range(len(answers)):
30 | answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
31 | predict = predict.strip().replace('\n', ' ').replace(' ', '')
32 | if answer in predict:
33 | OCRBench_score[category] += 1
34 | break
35 | else:
36 | for j in range(len(answers)):
37 | answer = answers[j].lower().strip().replace('\n', ' ')
38 | predict = predict.lower().strip().replace('\n', ' ')
39 | if answer in predict:
40 | OCRBench_score[category] += 1
41 | break
42 |
43 | final_score_dict = {}
44 | final_score_dict['Text Recognition'] = (
45 | OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
46 | + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
47 | + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition']
48 | )
49 | final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
50 | final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
51 | final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
52 | final_score_dict['Handwritten Mathematical Expression Recognition'] = \
53 | OCRBench_score['Handwritten Mathematical Expression Recognition']
54 | final_score_dict['Final Score'] = (
55 | final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
56 | + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
57 | + final_score_dict['Handwritten Mathematical Expression Recognition']
58 | )
59 | final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10
60 | score_pth = eval_file.replace('.xlsx', '_score.json')
61 | dump(final_score_dict, score_pth)
62 | logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
63 | logger.info('Score: ')
64 | for key, value in final_score_dict.items():
65 | logger.info('{}:{}'.format(key, value))
66 |
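The per-sample scoring rule above is a normalized substring match: a category gains one point if any ground-truth answer occurs inside the prediction after lower-casing and newline stripping (the handwritten-math category removes all whitespace instead of lower-casing). Extracted for illustration:

def _norm(s):
    return s.lower().strip().replace('\n', ' ')

answers, predict = ['HELLO'], 'The sign says hello world.'
hit = any(_norm(a) in _norm(predict) for a in answers)   # -> True, so this sample scores 1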
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__init__.py:
--------------------------------------------------------------------------------
1 | from .yes_or_no import default_rating, MME_rating, YOrN_eval
2 | from .mmvet_eval import MMVet_eval
3 | from .multiple_choice import multiple_choice_eval
4 | from .coco_eval import COCO_eval
5 | from .vqa_eval import VQAEval
6 | from .mathvista_eval import MathVista_eval
7 | from .llavabench import LLaVABench_eval
8 | from .misc import build_judge
9 | from .OCRBench import OCRBench_eval
10 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__pycache__/OCRBench.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/OCRBench.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__pycache__/coco_eval.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/coco_eval.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__pycache__/llavabench.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/llavabench.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__pycache__/mathvista_eval.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/mathvista_eval.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__pycache__/misc.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/misc.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__pycache__/mmvet_eval.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/mmvet_eval.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__pycache__/multiple_choice.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/multiple_choice.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__pycache__/vqa_eval.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/vqa_eval.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/__pycache__/yes_or_no.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/yes_or_no.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/coco_eval.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 | from pycocoevalcap.bleu.bleu import Bleu
3 | from pycocoevalcap.rouge.rouge import Rouge
4 | from pycocoevalcap.cider.cider import Cider
5 |
6 |
7 | class COCO_Caption_Scorer():
8 | def __init__(self, ref, gt):
9 | self.ref = ref
10 | self.gt = gt
11 | print('setting up scorers...')
12 | self.scorers = [
13 | (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
14 | # (Meteor(), "METEOR"), # need java version 11.0.16+
15 | (Rouge(), 'ROUGE_L'),
16 | (Cider(), 'CIDEr'),
17 | # (Spice(), "SPICE"), # need java version 11.0.16+
18 | ]
19 |
20 | def compute_scores(self):
21 | total_scores = {}
22 | for scorer, method in self.scorers:
23 | print('computing %s score...' % (scorer.method()))
24 | score, scores = scorer.compute_score(self.gt, self.ref)
25 | if type(method) == list:
26 | for sc, scs, m in zip(score, scores, method):
27 | print('%s: %0.3f' % (m, sc * 100))
28 | total_scores['Bleu'] = [x * 100 for x in score]
29 | else:
30 | print('%s: %0.3f' % (method, score * 100))
31 | total_scores[method] = score * 100
32 |
33 | print('*****DONE*****')
34 | for key, value in total_scores.items():
35 | print('{}:{}'.format(key, value))
36 | return total_scores
37 |
38 |
39 | def COCO_eval(eval_file, nproc=4, verbose=False):
40 | logger = get_logger('Evaluation')
41 |
42 | data = load(eval_file)
43 |
44 | lt = len(data)
45 | lines = [data.iloc[i] for i in range(lt)]
46 | ref = {}
47 | gt = {}
48 | for i, line in enumerate(lines):
49 | ref[str(i)] = [str(line['prediction'])]
50 | gt[str(i)] = eval(line['answer'])
51 |
52 | scorer = COCO_Caption_Scorer(ref, gt)
53 | coco_caption_score_dict = scorer.compute_scores()
54 |
55 | score_pth = eval_file.replace('.xlsx', '_score.json')
56 | dump(coco_caption_score_dict, score_pth)
57 | logger.info(f'COCO_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
58 | logger.info('Score: ')
59 | for key, value in coco_caption_score_dict.items():
60 | logger.info('{}:{}'.format(key, value))
61 |
62 |
63 | def parse_args():
64 | parser = argparse.ArgumentParser(description='Inference LLM Answers. ')
65 | parser.add_argument('--data', type=str, help='The question set for inference, in excel / tsv / json format. ')
66 | parser.add_argument('--nproc', type=int, default=4)
67 | parser.add_argument('--verbose', action='store_true')
68 | args = parser.parse_args()
69 | return args
70 |
71 |
72 | if __name__ == '__main__':
73 | args = parse_args()
74 | COCO_eval(eval_file=args.data, nproc=args.nproc, verbose=args.verbose)
75 |
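COCO_Caption_Scorer expects two dicts keyed by the same ids: ref maps each id to a single-element list holding the model prediction, and gt maps it to the list of reference captions parsed from the 'answer' column. A toy sketch (real runs need pycocoevalcap installed; scores on a single sample are not meaningful):

ref = {'0': ['a dog running on the beach']}
gt = {'0': ['a dog runs along the sand', 'a brown dog on a beach']}
scorer = COCO_Caption_Scorer(ref, gt)
scores = scorer.compute_scores()   # BLEU-1..4, ROUGE_L and CIDEr, each scaled by 100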
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/evaluate/misc.py:
--------------------------------------------------------------------------------
1 | import os
2 | from vlmeval.api import OpenAIWrapper, OpenAIWrapperInternal
3 | from vlmeval.smp import load_env
4 |
5 | INTERNAL = os.environ.get('INTERNAL', 0)
6 |
7 |
8 | def build_judge(**kwargs):
9 | model = kwargs.pop('model', None)
10 | load_env()
11 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None)
12 | if LOCAL_LLM is None:
13 | model_map = {
14 | 'gpt-4-turbo': 'gpt-4-1106-preview',
15 | 'gpt-4-0613': 'gpt-4-0613',
16 | 'gpt-4-0314': 'gpt-4-0314',
17 | 'gpt-4-0125': 'gpt-4-0125-preview',
18 | 'chatgpt-1106': 'gpt-3.5-turbo-1106',
19 | 'chatgpt-0613': 'gpt-3.5-turbo-0613',
20 | 'chatgpt-0125': 'gpt-3.5-turbo-0125'
21 | }
22 | model_version = model_map[model]
23 | else:
24 | model_version = LOCAL_LLM
25 | if INTERNAL:
26 | model = OpenAIWrapperInternal(model_version, **kwargs)
27 | else:
28 | model = OpenAIWrapper(model_version, **kwargs)
29 | return model
30 |
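A minimal sketch of resolving a judge model by alias: 'gpt-4-turbo' maps to 'gpt-4-1106-preview' via model_map unless LOCAL_LLM overrides the version, and INTERNAL switches to the internal wrapper. Extra keyword arguments are forwarded to the wrapper constructor (assumed here to accept temperature and verbose):

judge = build_judge(model='gpt-4-turbo', temperature=0, verbose=False)
reply = judge.generate('Reply with a single option letter. Question: 2 + 2 = ? A. 3 B. 4')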
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/__init__.py:
--------------------------------------------------------------------------------
1 | from .file import *
2 | from .vlm import *
3 | from .misc import *
4 | from .log import *
5 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/smp/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/__pycache__/file.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/smp/__pycache__/file.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/__pycache__/log.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/smp/__pycache__/log.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/__pycache__/misc.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/smp/__pycache__/misc.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/__pycache__/vlm.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/smp/__pycache__/vlm.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | logger_initialized = {}
4 |
5 |
6 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
7 | logger = logging.getLogger(name)
8 | if name in logger_initialized:
9 | return logger
10 |
11 | for logger_name in logger_initialized:
12 | if name.startswith(logger_name):
13 | return logger
14 |
15 | stream_handler = logging.StreamHandler()
16 | handlers = [stream_handler]
17 |
18 | try:
19 | import torch.distributed as dist
20 | if dist.is_available() and dist.is_initialized():
21 | rank = dist.get_rank()
22 | else:
23 | rank = 0
24 | except ImportError:
25 | rank = 0
26 |
27 | if rank == 0 and log_file is not None:
28 | file_handler = logging.FileHandler(log_file, file_mode)
29 | handlers.append(file_handler)
30 |
31 | formatter = logging.Formatter(
32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
33 | for handler in handlers:
34 | handler.setFormatter(formatter)
35 | handler.setLevel(log_level)
36 | logger.addHandler(handler)
37 |
38 | if rank == 0:
39 | logger.setLevel(log_level)
40 | else:
41 | logger.setLevel(logging.ERROR)
42 |
43 | logger_initialized[name] = True
44 | return logger
45 |
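Loggers are cached by name in logger_initialized, so repeated calls return the same configured logger; only rank 0 writes to the optional log file and logs at the requested level, while other ranks are silenced to ERROR. Typical usage:

logger = get_logger('Evaluation', log_file='eval.log')
logger.info('starting evaluation')
same_logger = get_logger('Evaluation')   # second call hits the cache, no duplicate handlers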
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/misc.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa: F401, F403
2 | import abc
3 | import argparse
4 | import csv
5 | import multiprocessing as mp
6 | import os
7 | import os.path as osp
8 | import copy as cp
9 | import random as rd
10 | import requests
11 | import shutil
12 | import subprocess
13 | import warnings
14 | import logging
15 | import pandas as pd
16 | from collections import OrderedDict, defaultdict
17 | from multiprocessing import Pool, current_process
18 | from tqdm import tqdm
19 | import datetime
20 | import matplotlib.pyplot as plt
21 | import seaborn as sns
22 | from tabulate import tabulate_formats, tabulate
23 | from huggingface_hub import scan_cache_dir
24 | from sty import fg, bg, ef, rs
25 |
26 | def process_punctuation(inText):
27 | import re
28 | outText = inText
29 | punct = [
30 | ';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-',
31 | '>', '<', '@', '`', ',', '?', '!'
32 | ]
33 | commaStrip = re.compile('(\d)(,)(\d)') # noqa: W605
34 | periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') # noqa: W605
35 | for p in punct:
36 | if (p + ' ' in inText or ' ' + p in inText) or (re.search(
37 | commaStrip, inText) is not None):
38 | outText = outText.replace(p, '')
39 | else:
40 | outText = outText.replace(p, ' ')
41 | outText = periodStrip.sub('', outText, re.UNICODE)
42 | return outText
43 |
44 | def h2r(value):
45 | if value[0] == '#':
46 | value = value[1:]
47 | assert len(value) == 6
48 | return tuple(int(value[i:i + 2], 16) for i in range(0, 6, 2))
49 |
50 | def r2h(rgb):
51 | return '#%02x%02x%02x' % rgb
52 |
53 | def colored(s, color):
54 | if isinstance(color, str):
55 | if hasattr(fg, color):
56 | return getattr(fg, color) + s + fg.rs
57 | color = h2r(color)
58 | return fg(*color) + s + fg.rs
59 |
60 | def istype(s, type):
61 | if isinstance(s, type):
62 | return True
63 | try:
64 | return isinstance(eval(s), type)
65 | except Exception as _:
66 | return False
67 |
68 | def bincount(lst):
69 | bins = defaultdict(lambda: 0)
70 | for item in lst:
71 | bins[item] += 1
72 | return bins
73 |
74 | def get_cache_path(repo_id):
75 | hf_cache_info = scan_cache_dir()
76 | repos = list(hf_cache_info.repos)
77 | repo = None
78 | for r in repos:
79 | if r.repo_id == repo_id:
80 | repo = r
81 | break
82 | if repo is None:
83 | return None
84 | revs = list(repo.revisions)
85 | rev2keep, last_modified = None, 0
86 | for rev in revs:
87 | if rev.last_modified > last_modified:
88 | rev2keep, last_modified = rev, rev.last_modified
89 | if rev2keep is None:
90 | return None
91 | return str(rev2keep.snapshot_path)
92 |
93 | def proxy_set(s):
94 | import os
95 | for key in ['http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY']:
96 | os.environ[key] = s
97 |
98 | def get_rank_and_world_size():
99 | rank = int(os.environ.get('RANK', 0))
100 | world_size = int(os.environ.get('WORLD_SIZE', 1))
101 | return rank, world_size
102 |
103 | def splitlen(s, sym='/'):
104 | return len(s.split(sym))
105 |
106 | def listinstr(lst, s):
107 | assert isinstance(lst, list)
108 | for item in lst:
109 | if item in s:
110 | return True
111 | return False
112 |
113 | def d2df(D):
114 | return pd.DataFrame({x: [D[x]] for x in D})
115 |
116 | def cn_string(s):
117 | import re
118 | if re.search(u'[\u4e00-\u9fff]', s):
119 | return True
120 | return False
121 |
122 | try:
123 | import decord
124 | except ImportError:
125 | pass
126 |
127 | def timestr(second=True, minute=False):
128 | s = datetime.datetime.now().strftime('%Y%m%d%H%M%S')[2:]
129 | if second:
130 | return s
131 | elif minute:
132 | return s[:-2]
133 | else:
134 | return s[:-4]
135 |
136 | def dict_merge(dct, merge_dct):
137 | for k, _ in merge_dct.items():
138 | if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict)): #noqa
139 | dict_merge(dct[k], merge_dct[k])
140 | else:
141 | dct[k] = merge_dct[k]
142 |
143 | def youtube_dl(idx):
144 | cmd = f'youtube-dl -f best -f mp4 "{idx}" -o {idx}.mp4'
145 | os.system(cmd)
146 |
147 | def run_command(cmd):
148 | if isinstance(cmd, str):
149 | cmd = cmd.split()
150 | return subprocess.check_output(cmd).decode()
151 |
152 | def load_env():
153 | logger = logging.getLogger('LOAD_ENV')
154 | try:
155 | import vlmeval
156 | except ImportError:
157 | logger.error('VLMEval is not installed. Failed to import environment variables from .env file. ')
158 | return
159 | pth = osp.realpath(vlmeval.__path__[0])
160 | pth = osp.join(pth, '../.env')
161 | pth = osp.realpath(pth)
162 | if not osp.exists(pth):
163 | logger.error(f'Did not detect the .env file at {pth}, failed to load. ')
164 | return
165 |
166 | from dotenv import dotenv_values
167 | values = dotenv_values(pth)
168 | for k, v in values.items():
169 | if v is not None and len(v):
170 | os.environ[k] = v
171 | logger.info(f'API Keys successfully loaded from {pth}')
172 |
173 | def pip_install_robust(package):
174 | import sys
175 | retry = 3
176 | while retry > 0:
177 | try:
178 | package_base = package.split('=')[0]
179 | module = __import__(package_base)
180 | return True
181 | except ImportError:
182 | subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
183 | retry -= 1
184 | return False
185 |
186 |
187 | def version_cmp(v1, v2, op='eq'):
188 | from packaging import version
189 | import operator
190 | op_func = getattr(operator, op)
191 | return op_func(version.parse(v1), version.parse(v2))
192 |
193 |
194 | def toliststr(s):
195 | if isinstance(s, str) and (s[0] == '[') and (s[-1] == ']'):
196 | return [str(x) for x in eval(s)]
197 | elif isinstance(s, str):
198 | return [s]
199 | elif isinstance(s, list):
200 | return [str(x) for x in s]
201 | raise NotImplementedError
202 |
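Expected behaviour of a few of the small helpers above, with the return values shown in comments:

listinstr(['cat', 'dog'], 'hotdog stand')   # True: 'dog' occurs in the string
splitlen('a/b/c')                           # 3
istype('123', int)                          # True: the string evaluates to an int
toliststr("['x.jpg', 'y.jpg']")             # ['x.jpg', 'y.jpg']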
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/vlm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import io
3 | import pandas as pd
4 | import numpy as np
5 | import string
6 | from uuid import uuid4
7 | import os.path as osp
8 | import base64
9 | from PIL import Image
10 | from .file import load, dump
11 | Image.MAX_IMAGE_PIXELS = 1e9
12 |
13 |
14 | def mmqa_display(question, target_size=512):
15 | question = {k.lower(): v for k, v in question.items()}
16 | keys = list(question.keys())
17 | keys = [k for k in keys if k not in ['index', 'image']]
18 |
19 | images = question['image']
20 | if isinstance(images, str):
21 | images = [images]
22 |
23 | idx = question.pop('index', 'XXX')
24 | print(f'INDEX: {idx}')
25 |
26 | for im in images:
27 | image = decode_base64_to_image(im, target_size=target_size)
28 | display(image) # noqa: F821
29 |
30 | for k in keys:
31 | try:
32 | if not pd.isna(question[k]):
33 | print(f'{k.upper()}. {question[k]}')
34 | except ValueError:
35 | if False in pd.isna(question[k]):
36 | print(f'{k.upper()}. {question[k]}')
37 |
38 |
39 | def encode_image_to_base64(img, target_size=-1):
40 | # if target_size == -1, will not do resizing
41 | # else, will set the max_size to (target_size, target_size)
42 | if img.mode in ('RGBA', 'P'):
43 | img = img.convert('RGB')
44 | tmp = osp.join('/tmp', str(uuid4()) + '.jpg')
45 | if target_size > 0:
46 | img.thumbnail((target_size, target_size))
47 | img.save(tmp)
48 | with open(tmp, 'rb') as image_file:
49 | image_data = image_file.read()
50 | ret = base64.b64encode(image_data).decode('utf-8')
51 | os.remove(tmp)
52 | return ret
53 |
54 |
55 | def encode_image_file_to_base64(image_path, target_size=-1):
56 | image = Image.open(image_path)
57 | return encode_image_to_base64(image, target_size=target_size)
58 |
59 |
60 | def decode_base64_to_image(base64_string, target_size=-1):
61 | image_data = base64.b64decode(base64_string)
62 | image = Image.open(io.BytesIO(image_data))
63 | if image.mode in ('RGBA', 'P'):
64 | image = image.convert('RGB')
65 | if target_size > 0:
66 | image.thumbnail((target_size, target_size))
67 | return image
68 |
69 |
70 | def decode_base64_to_image_file(base64_string, image_path, target_size=-1):
71 | image = decode_base64_to_image(base64_string, target_size=target_size)
72 | image.save(image_path)
73 |
74 |
75 | def build_option_str(option_dict):
76 | s = 'There are several options: \n'
77 | for c, content in option_dict.items():
78 | if not pd.isna(content):
79 | s += f'{c}. {content}\n'
80 | return s
81 |
82 |
83 | def isimg(s):
84 | return osp.exists(s) or s.startswith('http')
85 |
86 |
87 | def read_ok(img_path):
88 | if not osp.exists(img_path):
89 | return False
90 | try:
91 | im = Image.open(img_path)
92 | assert im.size[0] > 0 and im.size[1] > 0
93 | return True
94 | except:
95 | return False
96 |
97 |
98 | def gpt_key_set():
99 | openai_key = os.environ.get('OPENAI_API_KEY', None)
100 | return isinstance(openai_key, str) and openai_key.startswith('sk-')
101 |
102 |
103 | def apiok(wrapper):
104 | s = wrapper.generate('Hello!')
105 | return wrapper.fail_msg not in s
106 |
107 |
108 | def circular_pred(df, extract_func=None):
109 | if extract_func is None:
110 | extract_func = lambda x: x # noqa: E731
111 | df = df.sort_values('index')
112 | from vlmeval.utils import can_infer_option
113 | shift = int(1e6)
114 |
115 | choices = [extract_func(x) for x in df['prediction']]
116 | pred_map = {i: c for i, c in zip(df['index'], choices)}
117 | flag_map = {i: True for i in pred_map if i < 1e6}
118 | valid_map = {i: True for i in pred_map if i < 1e6}
119 | for i in df['index']:
120 | if i >= shift and pred_map[i] and pred_map[i - shift]:
121 | if (
122 | pred_map[i] not in list(string.ascii_uppercase) or # noqa: W504
123 | pred_map[i - shift] not in list(string.ascii_uppercase)
124 | ):
125 |
126 | valid_map[i % shift] = False
127 | continue
128 | if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1:
129 | continue
130 | else:
131 | flag_map[i % shift] = False
132 | flag_map = {k: v for k, v in flag_map.items() if valid_map[k]}
133 | flags = list(flag_map.values())
134 | return np.mean(flags)
135 |
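A round-trip sketch for the base64 helpers above (paths are placeholders): encoding always goes through a temporary JPEG, and a positive target_size thumbnails the long side down to that value before encoding or after decoding.

b64 = encode_image_file_to_base64('/path/to/demo.png', target_size=512)
img = decode_base64_to_image(b64)                        # PIL image in RGB, long side <= 512
decode_base64_to_image_file(b64, '/tmp/demo_roundtrip.jpg')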
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .matching_util import can_infer, can_infer_option, can_infer_text
2 | from .mp_util import track_progress_rich
3 | from .custom_prompt import CustomPrompt
4 | from .dataset_config import dataset_URLs, img_root_map, DATASET_TYPE, abbr2full
5 | from .dataset import TSVDataset, split_MMMU
6 | from .result_transfer import MMMU_result_transfer, MMTBench_result_transfer
7 |
8 |
9 | __all__ = [
10 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich',
11 | 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt',
12 | 'split_MMMU', 'abbr2full', 'MMMU_result_transfer', 'MMTBench_result_transfer'
13 | ]
14 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/__pycache__/custom_prompt.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/custom_prompt.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/__pycache__/dataset.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/dataset.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/__pycache__/dataset_config.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/dataset_config.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/__pycache__/matching_util.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/matching_util.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/__pycache__/mp_util.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/mp_util.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/__pycache__/result_transfer.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/result_transfer.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/custom_prompt.py:
--------------------------------------------------------------------------------
1 | from ..smp import *
2 | from .dataset_config import img_root_map
3 | from abc import abstractmethod
4 |
5 |
6 | class CustomPrompt:
7 |
8 | @abstractmethod
9 | def use_custom_prompt(self, dataset):
10 | raise NotImplementedError
11 |
12 | @abstractmethod
13 | def build_prompt(self, line, dataset):
14 | raise NotImplementedError
15 |
16 | def dump_image(self, line, dataset):
17 | ROOT = LMUDataRoot()
18 | assert isinstance(dataset, str)
19 | img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
20 | os.makedirs(img_root, exist_ok=True)
21 |
22 | if 'image' in line:
23 | if isinstance(line['image'], list):
24 | tgt_path = []
25 | assert 'image_path' in line
26 | for img, im_name in zip(line['image'], line['image_path']):
27 | path = osp.join(img_root, im_name)
28 | if not read_ok(path):
29 | decode_base64_to_image_file(img, path)
30 | tgt_path.append(path)
31 | else:
32 | tgt_path = osp.join(img_root, f"{line['index']}.jpg")
33 | if not read_ok(tgt_path):
34 | decode_base64_to_image_file(line['image'], tgt_path)
35 | tgt_path = [tgt_path]
36 | else:
37 | assert 'image_path' in line
38 | tgt_path = toliststr(line['image_path'])
39 |
40 | return tgt_path
41 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/dataset.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import hashlib
3 | from ..smp import *
4 | from .dataset_config import dataset_URLs, dataset_md5_dict, DATASET_TYPE
5 | from .custom_prompt import CustomPrompt
6 |
7 |
8 | def check_md5(data_path, dataset):
9 | if dataset not in dataset_md5_dict:
10 | warnings.warn(f'We do not have an md5 record for dataset {dataset}, skip the md5 check. ')
11 | return True
12 | assert osp.exists(data_path)
13 | with open(data_path, 'rb') as f:
14 | hash = hashlib.new('md5')
15 | for chunk in iter(lambda: f.read(2**20), b''):
16 | hash.update(chunk)
17 | if str(hash.hexdigest()) == dataset_md5_dict[dataset]:
18 | return True
19 | else:
20 | warnings.warn('this data file is incomplete, so it needs to be downloaded again.')
21 | return False
22 |
23 |
24 | def split_MMMU(msgs):
25 | text, images = None, []
26 | for s in msgs:
27 | if s['type'] == 'image':
28 | images.append(s['value'])
29 | elif s['type'] == 'text':
30 | assert text is None
31 | text = s['value']
32 | text_segs = text.split('<image ')
33 | segs = [dict(type='text', value=text_segs[0])]
34 | for i, seg in enumerate(text_segs):
35 | if i == 0:
36 | continue
37 | assert istype(seg[0], int) and seg[1] == '>'
38 | image_idx = int(seg[0]) - 1
39 | segs.append(dict(type='image', value=images[image_idx]))
40 | segs.append(dict(type='text', value=seg[2:]))
41 | return segs
42 |
43 |
44 | def prep_tsv(dataset):
45 | data_root = LMUDataRoot()
46 | assert osp.exists(data_root)
47 | update_flag = False
48 |
49 | if dataset in dataset_URLs:
50 | url = dataset_URLs[dataset]
51 | file_name = url.split('/')[-1]
52 | data_path = osp.join(data_root, file_name)
53 |
54 | if osp.exists(data_path) and check_md5(data_path, dataset):
55 | pass
56 | else:
57 | warnings.warn('The dataset tsv is not downloaded')
58 | download_file(url, data_path)
59 | update_flag = True
60 | else:
61 | data_path = osp.join(data_root, dataset + '.tsv')
62 | assert osp.exists(data_path)
63 |
64 | if file_size(data_path, 'GB') > 1:
65 | local_path = data_path.replace('.tsv', '_local.tsv')
66 | if not osp.exists(local_path) or update_flag or os.environ.get('FORCE_LOCAL', None):
67 | from ..tools import LOCALIZE
68 | LOCALIZE(data_path, local_path)
69 | return local_path
70 | else:
71 | return data_path
72 |
73 |
74 | class TSVDataset(CustomPrompt):
75 |
76 | def __init__(self, dataset='MMBench', skip_noimg=True):
77 |
78 | self.data_root = LMUDataRoot()
79 | self.dataset = dataset
80 | self.dataset_type = DATASET_TYPE(dataset)
81 | self.data_path = prep_tsv(dataset)
82 | data = load(self.data_path)
83 |
84 | self.skip_noimg = skip_noimg
85 | if skip_noimg and 'image' in data:
86 | data = data[~pd.isna(data['image'])]
87 |
88 | # Prompt for Captioning
89 | if listinstr(['COCO'], dataset):
90 | data['question'] = [(
91 | 'Please describe this image in general. Directly provide the description, '
92 | 'do not include prefix like "This image depicts". '
93 | )] * len(data)
94 |
95 | data['index'] = [str(x) for x in data['index']]
96 |
97 | self.meta_only = True
98 | if 'image' in data:
99 | data['image'] = [str(x) for x in data['image']]
100 |
101 | image_map = {x: y for x, y in zip(data['index'], data['image'])}
102 | for k in image_map:
103 | if len(image_map[k]) <= 64:
104 | idx = image_map[k]
105 | assert idx in image_map and len(image_map[idx]) > 64
106 | image_map[k] = image_map[idx]
107 |
108 | images = [toliststr(image_map[k]) for k in data['index']]
109 | data['image'] = [x[0] if len(x) == 1 else x for x in images]
110 | self.meta_only = False
111 |
112 | if 'image_path' in data:
113 | paths = [toliststr(x) for x in data['image_path']]
114 | data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
115 |
116 | if np.all([istype(x, int) for x in data['index']]):
117 | data['index'] = [int(x) for x in data['index']]
118 |
119 | self.data = data
120 |
121 | def __len__(self):
122 | return len(self.data)
123 |
124 | def build_prompt(self, line, dataset=None):
125 | if dataset is None:
126 | dataset = self.dataset
127 |
128 | if isinstance(line, int):
129 | line = self.data.iloc[line]
130 |
131 | if self.meta_only:
132 | tgt_path = toliststr(line['image_path'])
133 | else:
134 | tgt_path = self.dump_image(line, dataset)
135 |
136 | prompt = line['question']
137 | if DATASET_TYPE(dataset) == 'multi-choice':
138 | question = line['question']
139 | options = {
140 | cand: line[cand]
141 | for cand in string.ascii_uppercase
142 | if cand in line and not pd.isna(line[cand])
143 | }
144 | options_prompt = 'Options:\n'
145 | for key, item in options.items():
146 | options_prompt += f'{key}. {item}\n'
147 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
148 | prompt = ''
149 | if hint is not None:
150 | prompt += f'Hint: {hint}\n'
151 | prompt += f'Question: {question}\n'
152 | if len(options):
153 | prompt += options_prompt
154 | prompt += 'Please select the correct answer from the options above. \n'
155 | elif DATASET_TYPE(dataset) == 'VQA':
156 | if listinstr(['ocrvqa', 'textvqa', 'chartqa', 'docvqa'], dataset.lower()):
157 | prompt += '\nAnswer the question using a single word or phrase.\n'
158 |
159 | msgs = []
160 | if isinstance(tgt_path, list):
161 | msgs.extend([dict(type='image', value=p) for p in tgt_path])
162 | else:
163 | msgs = [dict(type='image', value=tgt_path)]
164 | msgs.append(dict(type='text', value=prompt))
165 |
166 | return msgs
167 |
168 | def display(self, line):
169 | if isinstance(line, int):
170 | line = self.data.iloc[line]
171 | mmqa_display(line)
172 |
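For a multi-choice line, build_prompt above stitches the optional hint, the question, the non-empty options, and a fixed instruction into one text message that follows the image messages. Illustrative shape of the returned list (all values are toy):

expected = [
    dict(type='image', value='/path/to/LMUData/images/MMBench/42.jpg'),
    dict(type='text', value=(
        'Hint: Look at the animal.\n'
        'Question: What is shown in the picture?\n'
        'Options:\nA. cat\nB. dog\n'
        'Please select the correct answer from the options above. \n')),
]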
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/matching_util.py:
--------------------------------------------------------------------------------
1 | import string
2 | import copy as cp
3 | import os
4 | from ..smp import *
5 |
6 |
7 | def can_infer_option(answer, choices):
8 | verbose = os.environ.get('VERBOSE', 0)
9 | # Choices is a dictionary
10 | if 'Failed to obtain answer via API' in answer:
11 | return False
12 |
13 | reject_to_answer = [
14 | "Sorry, I can't help with images of people yet.",
15 | "I can't process this file.",
16 | "I'm sorry, but without the image provided",
17 | 'Cannot determine the answer'
18 | ]
19 | for err in reject_to_answer:
20 | if err in answer:
21 | return 'Z'
22 |
23 | def count_choice(splits, choices, prefix='', suffix=''):
24 | cnt = 0
25 | for c in choices:
26 | if prefix + c + suffix in splits:
27 | cnt += 1
28 | return cnt
29 |
30 | answer_mod = cp.copy(answer)
31 | chars = '.()[],:;!*#{}'
32 | for c in chars:
33 | answer_mod = answer_mod.replace(c, ' ')
34 |
35 | splits = [x.strip() for x in answer_mod.split()]
36 | count = count_choice(splits, choices)
37 |
38 | if count == 1:
39 | for ch in choices:
40 | if 'A' in splits and len(splits) > 3 and verbose:
41 | logger = get_logger('Evaluation')
42 | logger.info(f'A might be a quantifier in the string: {answer}.')
43 | return False
44 | if ch in splits:
45 | return ch
46 | elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
47 | return 'Z'
48 | return False
49 |
50 |
51 | def can_infer_text(answer, choices):
52 | answer = answer.lower()
53 | assert isinstance(choices, dict)
54 | for k in choices:
55 | assert k in string.ascii_uppercase
56 | choices[k] = str(choices[k]).lower()
57 | cands = []
58 | for k in choices:
59 | if choices[k] in answer:
60 | cands.append(k)
61 | if len(cands) == 1:
62 | return cands[0]
63 | return False
64 |
65 |
66 | def can_infer(answer, choices):
67 | answer = str(answer)
68 | copt = can_infer_option(answer, choices)
69 | return copt if copt else can_infer_text(answer, choices)
70 |
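can_infer tries the two passes above in order: first an exact option-letter match on a punctuation-stripped token split, then a case-insensitive substring match against the option texts. For example:

choices = {'A': 'cat', 'B': 'dog'}
can_infer('The answer is (B).', choices)          # -> 'B', matched as a standalone letter
can_infer('It looks like a dog to me.', choices)  # -> 'B', via the text fallback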
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/result_transfer.py:
--------------------------------------------------------------------------------
1 | from ..evaluate.misc import build_judge
2 | from ..evaluate.multiple_choice import extract_answer_from_item
3 |
4 | from ..smp import *
5 | from .matching_util import can_infer
6 | from .mp_util import track_progress_rich
7 |
8 |
9 | def MMMU_result_transfer(result_path):
10 | res = {}
11 | result_data = load(result_path)
12 | mcq = result_data['A'].notna()
13 | lt = len(result_data)
14 | for i in range(lt):
15 | line = result_data.iloc[i]
16 | if mcq[i]:
17 | options = {
18 | cand: line[cand]
19 | for cand in string.ascii_uppercase
20 | if cand in line and not pd.isna(line[cand])
21 | }
22 | prediction = line['prediction']
23 | infer_prediction = can_infer(prediction, options)
24 | res[line['id']] = infer_prediction
25 | else:
26 | res[line['id']] = line['prediction']
27 | result_json = result_path.replace('.xlsx', '.json')
28 | dump(res, result_json)
29 | return result_json
30 |
31 |
32 | def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs):
33 | logger = get_logger('Evaluation')
34 | INTERNAL = os.environ.get('INTERNAL', 0)
35 | nproc = judge_kwargs.pop('nproc', 4)
36 |
37 | rd.seed(2680)
38 | suffix = eval_file.split('.')[-1]
39 | model = judge_kwargs['model']
40 | assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125']
41 | name_str_map = {
42 | 'chatgpt-0613': 'openai',
43 | 'gpt-4-0125': 'gpt4'
44 | }
45 | name_str = name_str_map[model] if model in name_str_map else model
46 |
47 | if model == 'exact_matching':
48 | model = None
49 | else:
50 | if INTERNAL or gpt_key_set():
51 | model = build_judge(**judge_kwargs)
52 | else:
53 | logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
54 | model = None
55 |
56 | logger.info(f'Evaluating {eval_file}')
57 | result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl')
58 | result = {}
59 | if osp.exists(result_file):
60 | result = load(result_file)
61 |
62 | data = load(eval_file)
63 | assert 'index' in data, 'Essential columns missing in the eval_file.'
64 |
65 | data = data.sort_values(by='index')
66 | data['prediction'] = [str(x) for x in data['prediction']]
67 | for k in data.keys():
68 | data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
69 |
70 | idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))}
71 | idx2lines = {k: v for k, v in idx2lines.items() if k not in result}
72 |
73 | indices = list(idx2lines.keys())
74 | lines = [idx2lines[i] for i in indices]
75 | tups = [(model, line) for line in lines]
76 | res = track_progress_rich(
77 | extract_answer_from_item,
78 | tups,
79 | nproc=nproc,
80 | chunksize=nproc,
81 | save=result_file,
82 | keys=indices)
83 |
84 | for i, r in zip(indices, res):
85 | if i in result:
86 | assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log']
87 | else:
88 | result[i] = r
89 |
90 | indices = list(data['index'])
91 | data['opt'] = [result[i]['opt'] for i in data['index']]
92 | data['log'] = [result[i]['log'] for i in data['index']]
93 |
94 | # load split
95 | output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')
96 | dump(data, output_path)
97 | return output_path
98 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | torch.set_grad_enabled(False)
4 | torch.manual_seed(1234)
5 | from .base import BaseModel
6 | from .cogvlm import CogVlm, GLM4v
7 | from .emu import Emu
8 | from .idefics import IDEFICS, IDEFICS2
9 | from .instructblip import InstructBLIP
10 | from .llava import LLaVA, LLaVA_Next, LLaVA_XTuner
11 | from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V
12 | from .minigpt4 import MiniGPT4
13 | from .mmalaya import MMAlaya
14 | from .monkey import Monkey, MonkeyChat
15 | from .mplug_owl2 import mPLUG_Owl2
16 | from .omnilmm import OmniLMM12B
17 | from .open_flamingo import OpenFlamingo
18 | from .pandagpt import PandaGPT
19 | from .qwen_vl import QwenVL, QwenVLChat
20 | from .transcore_m import TransCoreM
21 | from .visualglm import VisualGLM
22 | from .xcomposer import ShareCaptioner, XComposer, XComposer2, XComposer2_4KHD
23 | from .yi_vl import Yi_VL
24 | from .internvl_chat import InternVLChat
25 | from .deepseek_vl import DeepSeekVL
26 | from .mgm import Mini_Gemini
27 | from .bunnyllama3 import BunnyLLama3
28 | from .vxverse import VXVERSE
29 | from .paligemma import PaliGemma
30 | from .qh_360vl import QH_360VL
31 | from .phi3_vision import Phi3Vision
32 | from .wemm import WeMM
33 |
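A minimal quick-start sketch using one of the classes exported above, under assumptions: a CUDA GPU is available, QwenVLChat keeps its default HuggingFace checkpoint path, and the image path is a placeholder that must point to a real file. The plain [image path, question] list is converted by BaseModel.generate (see base.py below).

# Hypothetical quick-start sketch (GPU, default checkpoint, and image path are assumptions).
from vlmeval.vlm import QwenVLChat

model = QwenVLChat()   # loads the model's default checkpoint
print(model.generate(['assets/demo.jpg', 'Describe the image briefly.']))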
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/base.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/base.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/bunnyllama3.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/bunnyllama3.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/cogvlm.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/cogvlm.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/deepseek_vl.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/deepseek_vl.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/emu.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/emu.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/idefics.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/idefics.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/instructblip.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/instructblip.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/internvl_chat.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/internvl_chat.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/mgm.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/mgm.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/minicpm_v.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/minicpm_v.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/minigpt4.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/minigpt4.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/mmalaya.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/mmalaya.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/monkey.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/monkey.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/mplug_owl2.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/mplug_owl2.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/omnilmm.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/omnilmm.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/open_flamingo.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/open_flamingo.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/paligemma.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/paligemma.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/pandagpt.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/pandagpt.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/phi3_vision.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/phi3_vision.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/qh_360vl.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/qh_360vl.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/qwen_vl.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/qwen_vl.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/transcore_m.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/transcore_m.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/visualglm.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/visualglm.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/vxverse.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/vxverse.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/wemm.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/wemm.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/__pycache__/yi_vl.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/yi_vl.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/base.py:
--------------------------------------------------------------------------------
1 | from ..smp import *
2 | from ..utils.dataset_config import img_root_map
3 | from abc import abstractmethod
4 |
5 |
6 | class BaseModel:
7 |
8 | INTERLEAVE = False
9 | allowed_types = ['text', 'image']
10 |
11 | def use_custom_prompt(self, dataset):
12 | """Whether to use custom prompt for the given dataset.
13 |
14 | Args:
15 | dataset (str): The name of the dataset.
16 |
17 | Returns:
18 | bool: Whether to use a custom prompt. If True, `build_prompt` of the VLM will be called to build the prompt.
19 | Defaults to False.
20 | """
21 | return False
22 |
23 | @abstractmethod
24 | def build_prompt(self, line, dataset):
25 | """Build custom prompts for a specific dataset. Called only if `use_custom_prompt` returns True.
26 |
27 | Args:
28 | line (line of pd.DataFrame): The raw input line.
29 | dataset (str): The name of the dataset.
30 |
31 | Returns:
32 | list[dict]: The built message.
33 | """
34 | raise NotImplementedError
35 |
36 | def dump_image(self, line, dataset):
37 | """Dump the image(s) of the input line to the corresponding dataset folder.
38 |
39 | Args:
40 | line (line of pd.DataFrame): The raw input line.
41 | dataset (str): The name of the dataset.
42 |
43 | Returns:
44 | str | list[str]: The paths of the dumped images.
45 | """
46 | ROOT = LMUDataRoot()
47 | assert isinstance(dataset, str)
48 | img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
49 | os.makedirs(img_root, exist_ok=True)
50 | if 'image' in line:
51 | if isinstance(line['image'], list):
52 | tgt_path = []
53 | assert 'image_path' in line
54 | for img, im_name in zip(line['image'], line['image_path']):
55 | path = osp.join(img_root, im_name)
56 | if not read_ok(path):
57 | decode_base64_to_image_file(img, path)
58 | tgt_path.append(path)
59 | else:
60 | tgt_path = osp.join(img_root, f"{line['index']}.jpg")
61 | if not read_ok(tgt_path):
62 | decode_base64_to_image_file(line['image'], tgt_path)
63 | tgt_path = [tgt_path]
64 | else:
65 | assert 'image_path' in line
66 | tgt_path = toliststr(line['image_path'])
67 |
68 | return tgt_path
69 |
70 | @abstractmethod
71 | def generate_inner(self, message, dataset=None):
72 | raise NotImplementedError
73 |
74 | def check_content(self, msgs):
75 | """Check the content type of the input. Four types are allowed: str, dict, liststr, listdict.
76 | """
77 | if isinstance(msgs, str):
78 | return 'str'
79 | if isinstance(msgs, dict):
80 | return 'dict'
81 | if isinstance(msgs, list):
82 | types = [self.check_content(m) for m in msgs]
83 | if all(t == 'str' for t in types):
84 | return 'liststr'
85 | if all(t == 'dict' for t in types):
86 | return 'listdict'
87 | return 'unknown'
88 |
89 | def preproc_content(self, inputs):
90 | """Convert the raw input messages to a list of dicts.
91 |
92 | Args:
93 | inputs: raw input messages.
94 |
95 | Returns:
96 | list(dict): The preprocessed input messages. Returns None if the input cannot be preprocessed.
97 | """
98 | if self.check_content(inputs) == 'str':
99 | return [dict(type='text', value=inputs)]
100 | elif self.check_content(inputs) == 'dict':
101 | assert 'type' in inputs and 'value' in inputs
102 | return [inputs]
103 | elif self.check_content(inputs) == 'liststr':
104 | res = []
105 | for s in inputs:
106 | mime, pth = parse_file(s)
107 | if mime is None or mime == 'unknown':
108 | res.append(dict(type='text', value=s))
109 | else:
110 | res.append(dict(type=mime.split('/')[0], value=pth))
111 | return res
112 | elif self.check_content(inputs) == 'listdict':
113 | for item in inputs:
114 | assert 'type' in item and 'value' in item
115 | mime, s = parse_file(item['value'])
116 | if mime is None:
117 | assert item['type'] == 'text'
118 | else:
119 | assert mime.split('/')[0] == item['type']
120 | item['value'] = s
121 | return inputs
122 | else:
123 | return None
124 |
125 | def generate(self, message, dataset=None):
126 | """Generate the output message.
127 |
128 | Args:
129 | message (list[dict]): The input message.
130 | dataset (str, optional): The name of the dataset. Defaults to None.
131 |
132 | Returns:
133 | str: The generated message.
134 | """
135 | assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}'
136 | message = self.preproc_content(message)
137 | assert message is not None and self.check_content(message) == 'listdict'
138 | for item in message:
139 | assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}'
140 | return self.generate_inner(message, dataset)
141 |
142 | def message_to_promptimg(self, message):
143 | assert not self.INTERLEAVE
144 | model_name = self.__class__.__name__
145 | warnings.warn(
146 | f'Model {model_name} does not support interleaved input. '
147 | 'Will use the first image and aggregated texts as prompt. ')
148 | num_images = len([x for x in message if x['type'] == 'image'])
149 | if num_images == 0:
150 | prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
151 | image = None
152 | else:
153 | prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
154 | image = [x['value'] for x in message if x['type'] == 'image'][0]
155 | return prompt, image
156 |
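A minimal sketch of the contract BaseModel defines: after preproc_content, generate_inner always receives a list of {'type': 'text' | 'image', 'value': ...} dicts. The toy subclass below is illustrative only, not part of VLMEvalKit.

# Sketch of a toy subclass (illustrative only).
from vlmeval.vlm.base import BaseModel

class EchoModel(BaseModel):
    INTERLEAVE = True   # accept interleaved text/image messages

    def generate_inner(self, message, dataset=None):
        texts = [m['value'] for m in message if m['type'] == 'text']
        n_img = len([m for m in message if m['type'] == 'image'])
        return f"{n_img} image(s); prompt: {' '.join(texts)}"

model = EchoModel()
# The image path is a placeholder and must point to an existing file so that
# parse_file can tag it as an image during preprocessing.
print(model.generate([
    dict(type='image', value='assets/demo.jpg'),
    dict(type='text', value='What is shown here?'),
]))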
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/bunnyllama3.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import transformers
3 | from transformers import AutoModelForCausalLM, AutoTokenizer
4 | from PIL import Image
5 | import warnings
6 |
7 | from .base import BaseModel
8 | from ..smp import *
9 | from ..utils import DATASET_TYPE
10 |
11 |
12 | class BunnyLLama3(BaseModel):
13 |
14 | INSTALL_REQ = False
15 | INTERLEAVE = False
16 |
17 | def __init__(self, model_path='BAAI/Bunny-Llama-3-8B-V', **kwargs):
18 | assert model_path is not None
19 | transformers.logging.set_verbosity_error()
20 | transformers.logging.disable_progress_bar()
21 | warnings.filterwarnings('ignore')
22 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
23 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True)
24 | self.kwargs = kwargs
25 |
26 | def generate_inner(self, message, dataset=None):
27 | prompt, image_path = self.message_to_promptimg(message)
28 | text = f"A chat between a curious user and an artificial intelligence assistant. \
29 | The assistant gives helpful, detailed, and polite answers to the user's questions. \
30 | USER: <image>\n{prompt} ASSISTANT:"
31 | text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')]
32 | input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0)
33 | image = Image.open(image_path).convert('RGB')
34 | image_tensor = self.model.process_images([image], self.model.config).to(dtype=self.model.dtype)
35 |
36 | output_ids = self.model.generate(input_ids, images=image_tensor, max_new_tokens=100, use_cache=True)[0]
37 | response = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True)
38 | return response
39 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/cogvlm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from PIL import Image
3 | from .base import BaseModel
4 | from ..smp import *
5 | from ..utils import DATASET_TYPE
6 | from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer
7 |
8 |
9 | class GLM4v(BaseModel):
10 |
11 | INSTALL_REQ = False
12 | INTERLEAVE = False
13 |
14 | def __init__(self, model_path='THUDM/glm-4v-9b', **kwargs):
15 | assert model_path is not None
16 | self.model_path = model_path
17 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
18 | self.model = AutoModelForCausalLM.from_pretrained(
19 | model_path,
20 | torch_dtype=torch.bfloat16,
21 | low_cpu_mem_usage=True,
22 | trust_remote_code=True
23 | ).to('cuda').eval()
24 | gen_kwargs = {'max_length': 2048, 'do_sample': False}
25 | gen_kwargs.update(kwargs)
26 | self.kwargs = gen_kwargs
27 | self.end_text_token = '<|endoftext|>'
28 |
29 | def generate_inner(self, message, dataset=None):
30 | prompt, image_path = self.message_to_promptimg(message)
31 | image = Image.open(image_path).convert('RGB')
32 | inputs = self.tokenizer.apply_chat_template(
33 | [{'role': 'user', 'image': image, 'content': prompt}],
34 | add_generation_prompt=True, tokenize=True, return_tensors='pt', return_dict=True
35 | )
36 | inputs = inputs.to('cuda')
37 |
38 | with torch.no_grad():
39 | outputs = self.model.generate(**inputs, **self.kwargs)
40 | outputs = outputs[:, inputs['input_ids'].shape[1]:]
41 | response = self.tokenizer.decode(outputs[0])
42 | return response.split(self.end_text_token)[0]
43 |
44 |
45 | class CogVlm(BaseModel):
46 |
47 | INSTALL_REQ = False
48 | INTERLEAVE = False
49 |
50 | def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=None, **kwargs):
51 | assert model_path is not None
52 | model = AutoModelForCausalLM.from_pretrained(
53 | model_path,
54 | torch_dtype=torch.bfloat16,
55 | trust_remote_code=True,
56 | ).to('cuda').eval()
57 |
58 | self.kwargs = kwargs
59 | if tokenizer_name:
60 | tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name)
61 | gen_kwargs = {'max_length': 2048, 'do_sample': False}
62 | self.end_text_token = '</s>'
63 | else:
64 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
65 | gen_kwargs = {'max_new_tokens': 2048, 'pad_token_id': 128002}
66 | self.end_text_token = '<|end_of_text|>'
67 | self.kwargs.update(gen_kwargs)
68 | self.tokenizer = tokenizer
69 | self.model = model
70 |
71 | def use_custom_prompt(self, dataset):
72 | assert dataset is not None
73 | if DATASET_TYPE(dataset) == 'multi-choice':
74 | return True
75 | return False
76 |
77 | def build_prompt(self, line, dataset=None):
78 | assert dataset is None or isinstance(dataset, str)
79 | assert self.use_custom_prompt(dataset)
80 | tgt_path = self.dump_image(line, dataset)
81 |
82 | if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
83 | question = line['question']
84 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
85 | if hint is not None:
86 | question = hint + '\n' + question
87 |
88 | option_candidate = string.ascii_uppercase
89 | options = {
90 | cand: line[cand]
91 | for cand in option_candidate
92 | if cand in line and not pd.isna(line[cand])
93 | }
94 | for key, item in options.items():
95 | question += f'\n{key}. {item}'
96 | prompt = question
97 |
98 | if not cn_string(prompt):
99 | prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly."
100 | else:
101 | prompt = prompt + '\n' + '请直接回答选项字母。'
102 | else:
103 | prompt = line['question']
104 | message = [dict(type='text', value=prompt)]
105 | message.extend([dict(type='image', value=p) for p in tgt_path])
106 |
107 | return message
108 |
109 | def generate_inner(self, message, dataset=None):
110 | prompt, image_path = self.message_to_promptimg(message)
111 |
112 | image = Image.open(image_path).convert('RGB')
113 | inputs = self.model.build_conversation_input_ids(
114 | self.tokenizer, query=prompt, history=[], images=[image]) # chat mode
115 | inputs = {
116 | 'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
117 | 'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
118 | 'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'),
119 | 'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]],
120 | }
121 |
122 | with torch.no_grad():
123 | outputs = self.model.generate(**inputs, **self.kwargs)
124 | outputs = outputs[:, inputs['input_ids'].shape[1]:]
125 | response = self.tokenizer.decode(outputs[0])
126 | response = response.split(self.end_text_token)[0].strip()
127 | return response
128 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/deepseek_vl.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | from transformers import AutoModelForCausalLM
4 | import warnings
5 | from .base import BaseModel
6 |
7 |
8 | class DeepSeekVL(BaseModel):
9 |
10 | INSTALL_REQ = True
11 | INTERLEAVE = True
12 |
13 | def check_install(self):
14 | try:
15 | import deepseek_vl
16 | except ImportError:
17 | warnings.warn(
18 | 'Please first install deepseek_vl from the source code at https://github.com/deepseek-ai/DeepSeek-VL')
19 | sys.exit(-1)
20 |
21 | def __init__(self, model_path='deepseek-ai/deepseek-vl-1.3b-chat', **kwargs):
22 | self.check_install()
23 | assert model_path is not None
24 | self.model_path = model_path
25 | from deepseek_vl.models import VLChatProcessor
26 |
27 | self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
28 | self.tokenizer = self.vl_chat_processor.tokenizer
29 |
30 | model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
31 | self.model = model.to(torch.bfloat16).cuda().eval()
32 |
33 | torch.cuda.empty_cache()
34 | default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True)
35 | default_kwargs.update(kwargs)
36 | self.kwargs = default_kwargs
37 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
38 |
39 | def prepare_inputs(self, message):
40 | content, images = '', []
41 | for s in message:
42 | if s['type'] == 'image':
43 | images.append(s['value'])
44 | content += '<image_placeholder>'
45 | elif s['type'] == 'text':
46 | content += s['value']
47 | conversation = [
48 | dict(role='User', content=content, images=images),
49 | dict(role='Assistant', content='')
50 | ]
51 | return conversation
52 |
53 | def generate_inner(self, message, dataset=None):
54 | conversation = self.prepare_inputs(message)
55 | from deepseek_vl.utils.io import load_pil_images
56 | pil_images = load_pil_images(conversation)
57 | prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True)
58 | prepare_inputs = prepare_inputs.to(self.model.device)
59 | inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)
60 |
61 | outputs = self.model.language_model.generate(
62 | inputs_embeds=inputs_embeds,
63 | attention_mask=prepare_inputs.attention_mask,
64 | pad_token_id=self.tokenizer.eos_token_id,
65 | bos_token_id=self.tokenizer.bos_token_id,
66 | eos_token_id=self.tokenizer.eos_token_id,
67 | **self.kwargs)
68 | answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
69 | return answer
70 |
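An illustration (with hypothetical image paths) of the conversation structure prepare_inputs builds from an interleaved message before it reaches VLChatProcessor.

# Illustration only; values are placeholders.
message = [
    dict(type='image', value='img_0.jpg'),
    dict(type='text', value='How do these two scenes differ?'),
    dict(type='image', value='img_1.jpg'),
]
# DeepSeekVL.prepare_inputs(message) would return:
# [
#     {'role': 'User',
#      'content': '<image_placeholder>How do these two scenes differ?<image_placeholder>',
#      'images': ['img_0.jpg', 'img_1.jpg']},
#     {'role': 'Assistant', 'content': ''},
# ]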
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/emu.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from PIL import Image
4 | import os.path as osp
5 | from .base import BaseModel
6 | from ..smp import *
7 |
8 |
9 | class Emu(BaseModel):
10 |
11 | INSTALL_REQ = False
12 | INTERLEAVE = True
13 |
14 | def __init__(self,
15 | model_path='BAAI/Emu2-Chat',
16 | **kwargs):
17 |
18 | self.model_path = model_path
19 | assert osp.exists(model_path) or splitlen(model_path) == 2
20 |
21 | from transformers import AutoModelForCausalLM, AutoTokenizer
22 | from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
23 |
24 | local_rank = int(os.environ.get('LOCAL_RANK', 0))
25 |
26 | device_num = torch.cuda.device_count()
27 | assert local_rank * 2 <= device_num, 'The number of devices does not match the world size'
28 | assert device_num >= 2, 'You need at least 2 GPUs to use EMU'
29 |
30 | device_1 = local_rank
31 | device_2 = local_rank + device_num // 2
32 |
33 | torch.cuda.set_device(device_1)
34 | torch.cuda.set_device(device_2)
35 |
36 | tokenizer = AutoTokenizer.from_pretrained(model_path) # "BAAI/Emu2-Chat"
37 | self.tokenizer = tokenizer
38 | with init_empty_weights():
39 | model = AutoModelForCausalLM.from_pretrained(
40 | model_path, # "BAAI/Emu2-Chat"
41 | torch_dtype=torch.bfloat16,
42 | low_cpu_mem_usage=True,
43 | trust_remote_code=True)
44 |
45 | device_map = infer_auto_device_map(
46 | model,
47 | max_memory={
48 | device_1: '70GiB',
49 | device_2: '70GiB'
50 | },
51 | no_split_module_classes=['Block', 'LlamaDecoderLayer'])
52 |
53 | # input and output logits should be on same device
54 | device_map['model.decoder.lm.lm_head'] = device_1
55 |
56 | model = dispatch_model(
57 | model,
58 | device_map=device_map).eval()
59 |
60 | self.model = model
61 | kwargs_default = dict(max_new_tokens=512, length_penalty=-1)
62 | kwargs_default.update(kwargs)
63 | self.kwargs = kwargs_default
64 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
65 |
66 | def generate_inner(self, message, dataset=None):
67 | query, images = '', []
68 | for item in message:
69 | if item['type'] == 'image':
70 | images.append(Image.open(item['value']).convert('RGB'))
71 | query += '[<IMG_PLH>]'
72 | elif item['type'] == 'text':
73 | query += item['value']
74 |
75 | inputs = self.model.build_input_ids(
76 | text=[query],
77 | tokenizer=self.tokenizer,
78 | image=images
79 | )
80 |
81 | with torch.no_grad():
82 | outputs = self.model.generate(
83 | input_ids=inputs['input_ids'],
84 | attention_mask=inputs['attention_mask'],
85 | image=inputs['image'].to(torch.bfloat16),
86 | **self.kwargs)
87 |
88 | output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
89 | return output_text[0]
90 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/instructblip.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from PIL import Image
3 | import os.path as osp
4 | import sys
5 | from .base import BaseModel
6 | from ..smp import *
7 |
8 |
9 | class InstructBLIP(BaseModel):
10 |
11 | INSTALL_REQ = True
12 | INTERLEAVE = False
13 |
14 | def __init__(self, name):
15 | self.config_map = {
16 | 'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml',
17 | 'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml',
18 | }
19 |
20 | self.file_path = __file__
21 | config_root = osp.dirname(self.file_path)
22 |
23 | try:
24 | from lavis.models import load_preprocess
25 | from omegaconf import OmegaConf
26 | from lavis.common.registry import registry
27 | except:
28 | warnings.warn('Please install lavis before using InstructBLIP. ')
29 | sys.exit(-1)
30 |
31 | assert name in self.config_map
32 | cfg_path = osp.join(config_root, self.config_map[name])
33 | cfg = OmegaConf.load(cfg_path)
34 |
35 | model_cfg = cfg.model
36 | assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2
37 | model_cls = registry.get_model_class(name='blip2_vicuna_instruct')
38 | model = model_cls.from_config(model_cfg)
39 | model.eval()
40 |
41 | self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
42 | device = self.device
43 | model.to(device)
44 | self.model = model
45 | self.kwargs = {'max_length': 512}
46 |
47 | preprocess_cfg = cfg.preprocess
48 | vis_processors, _ = load_preprocess(preprocess_cfg)
49 | self.vis_processors = vis_processors
50 |
51 | def generate_inner(self, message, dataset=None):
52 | prompt, image_path = self.message_to_promptimg(message)
53 | vis_processors = self.vis_processors
54 | raw_image = Image.open(image_path).convert('RGB')
55 | image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device)
56 | outputs = self.model.generate(dict(image=image_tensor, prompt=prompt))
57 | return outputs[0]
58 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .llava import LLaVA, LLaVA_Next
2 | from .llava_xtuner import LLaVA_XTuner
3 |
4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner']
5 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/llava/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/llava/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava_xtuner.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava_xtuner.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/minigpt4.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | import os.path as osp
4 | import warnings
5 | from transformers import StoppingCriteriaList
6 | from .base import BaseModel
7 |
8 |
9 | class MiniGPT4(BaseModel):
10 |
11 | INSTALL_REQ = True
12 | INTERLEAVE = False
13 |
14 | def __init__(self,
15 | mode='v2',
16 | root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/',
17 | temperature=1,
18 | max_out_len=512):
19 |
20 | if root is None:
21 | warnings.warn(
22 | 'Please set root to the directory of MiniGPT-4, which is cloned from here: '
23 | 'https://github.com/Vision-CAIR/MiniGPT-4. '
24 | )
25 |
26 | if mode == 'v2':
27 | cfg = 'minigptv2_eval.yaml'
28 | elif mode == 'v1_7b':
29 | cfg = 'minigpt4_7b_eval.yaml'
30 | elif mode == 'v1_13b':
31 | cfg = 'minigpt4_13b_eval.yaml'
32 | else:
33 | raise NotImplementedError
34 |
35 | self.mode = mode
36 | self.temperature = temperature
37 | self.max_out_len = max_out_len
38 | self.root = root
39 | this_dir = osp.dirname(__file__)
40 |
41 | self.cfg = osp.join(this_dir, 'misc', cfg)
42 | sys.path.append(self.root)
43 |
44 | from omegaconf import OmegaConf
45 | from minigpt4.common.registry import registry
46 | from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2
47 |
48 | device = torch.cuda.current_device()
49 | self.device = device
50 |
51 | cfg_path = self.cfg
52 | cfg = OmegaConf.load(cfg_path)
53 |
54 | model_cfg = cfg.model
55 | model_cfg.device_8bit = device
56 | model_cls = registry.get_model_class(model_cfg.arch)
57 | model = model_cls.from_config(model_cfg)
58 | model = model.to(device)
59 | model.eval()
60 | vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train
61 | vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
62 | self.model = model
63 | self.vis_processor = vis_processor
64 |
65 | self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0
66 | stop_words_ids = [[835], [2277, 29937]]
67 | stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids]
68 | self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
69 |
70 | def generate_inner(self, message, dataset=None):
71 | from minigpt4.conversation.conversation import Chat
72 | prompt, image_path = self.message_to_promptimg(message)
73 | if self.mode == 'v2':
74 | chat = Chat(self.model, self.vis_processor, device=self.device)
75 | else:
76 | chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria)
77 |
78 | chat_state = self.CONV_VISION.copy()
79 | img_list = []
80 | _ = chat.upload_img(image_path, chat_state, img_list)
81 | chat.encode_img(img_list)
82 | chat.ask(prompt, chat_state)
83 | with torch.inference_mode():
84 | msg = chat.answer(conv=chat_state, img_list=img_list)[0]
85 | return msg
86 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: instruct_vicuna13b
8 | load_finetuned: False
9 | load_pretrained: True
10 |
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth"
12 | finetuned: ""
13 |
14 | # vit encoder
15 | image_size: 224
16 | drop_path_rate: 0
17 | use_grad_checkpoint: False
18 | vit_precision: "fp16"
19 | freeze_vit: True
20 |
21 | # Q-Former
22 | num_query_token: 32
23 |
24 | # path to Vicuna checkpoint
25 | llm_model: "Please set the path to your vicuna-13b-v1.1"
26 |
27 | # generation configs
28 | prompt: ""
29 |
30 |
31 | preprocess:
32 | vis_processor:
33 | train:
34 | name: "blip2_image_train"
35 | image_size: 224
36 | eval:
37 | name: "blip_image_eval"
38 | image_size: 224
39 | text_processor:
40 | train:
41 | name: "blip_caption"
42 | eval:
43 | name: "blip_caption"
44 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: instruct_vicuna7b
8 | load_finetuned: False
9 | load_pretrained: True
10 |
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"
12 | finetuned: ""
13 |
14 | # vit encoder
15 | image_size: 224
16 | drop_path_rate: 0
17 | use_grad_checkpoint: False
18 | vit_precision: "fp16"
19 | freeze_vit: True
20 |
21 | # Q-Former
22 | num_query_token: 32
23 |
24 | # path to Vicuna checkpoint
25 | llm_model: "Please set the path to your vicuna-7b-v1.1"
26 |
27 | # generation configs
28 | prompt: ""
29 |
30 |
31 | preprocess:
32 | vis_processor:
33 | train:
34 | name: "blip2_image_train"
35 | image_size: 224
36 | eval:
37 | name: "blip_image_eval"
38 | image_size: 224
39 | text_processor:
40 | train:
41 | name: "blip_caption"
42 | eval:
43 | name: "blip_caption"
44 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/misc/minigpt4_13b_eval.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | arch: minigpt4
3 | model_type: pretrain_vicuna_7b
4 | max_txt_len: 160
5 | end_sym: "###"
6 | low_resource: True
7 | prompt_template: '###Human: {} ###Assistant: '
8 | ckpt: "please set this value to the path of pretrained checkpoint"
9 |
10 | # vit encoder
11 | image_size: 224
12 | drop_path_rate: 0
13 | use_grad_checkpoint: False
14 | vit_precision: "fp16"
15 | freeze_vit: True
16 | freeze_qformer: True
17 |
18 | # Q-Former
19 | num_query_token: 32
20 |
21 | # generation configs
22 | prompt: ""
23 |
24 | llama_model: "please set this value to the path of vicuna-13b-v0"
25 |
26 | datasets:
27 | cc_sbu_align:
28 | vis_processor:
29 | train:
30 | name: "blip2_image_eval"
31 | image_size: 224
32 | text_processor:
33 | train:
34 | name: "blip_caption"
35 |
36 | run:
37 | task: image_text_pretrain
38 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | arch: minigpt4
3 | model_type: pretrain_vicuna_7b
4 | max_txt_len: 160
5 | end_sym: "###"
6 | low_resource: True
7 | prompt_template: '###Human: {} ###Assistant: '
8 | ckpt: "please set this value to the path of pretrained checkpoint"
9 |
10 | # vit encoder
11 | image_size: 224
12 | drop_path_rate: 0
13 | use_grad_checkpoint: False
14 | vit_precision: "fp16"
15 | freeze_vit: True
16 | freeze_qformer: True
17 |
18 | # Q-Former
19 | num_query_token: 32
20 |
21 | # generation configs
22 | prompt: ""
23 |
24 | llama_model: "please set this value to the path of vicuna-7b-v0"
25 |
26 |
27 | datasets:
28 | cc_sbu_align:
29 | vis_processor:
30 | train:
31 | name: "blip2_image_eval"
32 | image_size: 224
33 | text_processor:
34 | train:
35 | name: "blip_caption"
36 |
37 | run:
38 | task: image_text_pretrain
39 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/misc/minigptv2_eval.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | arch: minigpt_v2
3 | model_type: pretrain
4 | max_txt_len: 160
5 | end_sym: "</s>"
6 | low_resource: True
7 | prompt_template: '[INST] {} [/INST]'
8 | ckpt: "please set this value to the path of pretrained checkpoint"
9 | lora_r: 64
10 | lora_alpha: 16
11 |
12 | # vit encoder
13 | image_size: 448
14 | drop_path_rate: 0
15 | use_grad_checkpoint: False
16 | vit_precision: "fp16"
17 | freeze_vit: True
18 |
19 | # generation configs
20 | prompt: ""
21 |
22 | # LLM
23 | llama_model: "please set this value to the path of llama2-chat-7b"
24 |
25 | datasets:
26 | cc_sbu_align:
27 | vis_processor:
28 | train:
29 | name: "blip2_image_eval"
30 | image_size: 448
31 | text_processor:
32 | train:
33 | name: "blip_caption"
34 |
35 | run:
36 | task: image_text_pretrain
37 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/mmalaya.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 | import warnings
4 | from PIL import Image
5 | from .base import BaseModel
6 |
7 |
8 | class MMAlaya(BaseModel):
9 |
10 | INSTALL_REQ = False
11 | INTERLEAVE = False
12 |
13 | def __init__(self, model_path='DataCanvas/MMAlaya', **kwargs):
14 | assert model_path is not None
15 | self.model_path = model_path
16 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
17 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True).eval()
18 | # need to initialize the tokenizer
19 | model.initialize_tokenizer(self.tokenizer)
20 | self.model = model.cuda()
21 |
22 | self.kwargs = kwargs
23 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
24 | torch.cuda.empty_cache()
25 |
26 | def generate_inner(self, message, dataset=None):
27 | # read image
28 | prompt, image_path = self.message_to_promptimg(message)
29 | image = Image.open(image_path).convert('RGB')
30 | # tokenize prompt, and preprocess image
31 | input_ids, image_tensor, stopping_criteria = self.model.prepare_for_inference(
32 | prompt,
33 | self.tokenizer,
34 | image,
35 | return_tensors='pt')
36 | with torch.inference_mode():
37 | output_ids = self.model.generate(
38 | inputs=input_ids.cuda(),
39 | images=image_tensor.cuda(),
40 | do_sample=False,
41 | max_new_tokens=512,
42 | num_beams=1,
43 | use_cache=True,
44 | stopping_criteria=[stopping_criteria],
45 | )
46 | # truncate input_ids in generate_ids and then decode to text
47 | input_token_len = input_ids.shape[1]
48 | response = self.tokenizer.batch_decode(
49 | output_ids[:, input_token_len:].cpu(),
50 | skip_special_tokens=True,
51 | clean_up_tokenization_spaces=False
52 | )[0].strip()
53 | return response
54 |
55 |
56 | if __name__ == '__main__':
57 | model = MMAlaya()
58 | response = model.generate(['./assets/apple.jpg', '请详细描述一下这张图片。'])
59 | print(response)
60 |
61 | """
62 | export PYTHONPATH=$PYTHONPATH:/tmp/VLMEvalKit
63 | CUDA_VISIBLE_DEVICES=0 python vlmeval/vlm/mmalaya.py
64 | """
65 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/mplug_owl2.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | from PIL import Image
4 | from .base import BaseModel
5 | from ..smp import *
6 | from ..utils import DATASET_TYPE
7 |
8 |
9 | class mPLUG_Owl2(BaseModel):
10 |
11 | INSTALL_REQ = True
12 | INTERLEAVE = False
13 |
14 | def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs):
15 | try:
16 | from mplug_owl2.model.builder import load_pretrained_model
17 | from mplug_owl2.mm_utils import get_model_name_from_path
18 | except:
19 | warnings.warn('Please install mPLUG_Owl2 before using mPLUG_Owl2. ')
20 | sys.exit(-1)
21 |
22 | model_name = get_model_name_from_path(model_path)
23 | tokenizer, model, image_processor, context_len = load_pretrained_model(
24 | model_path, None, model_name, load_8bit=False, load_4bit=False, device='cpu')
25 |
26 | self.model = model.cuda()
27 | self.device = self.model.device
28 | self.image_processor = image_processor
29 | tokenizer.padding_side = 'left'
30 | tokenizer.pad_token_id = tokenizer.eos_token_id
31 | self.tokenizer = tokenizer
32 | self.context_len = context_len
33 |
34 | kwargs_default = dict(
35 | max_new_tokens=512, do_sample=False, num_beams=1,
36 | min_new_tokens=1, length_penalty=1, num_return_sequences=1)
37 | kwargs_default.update(kwargs)
38 | self.kwargs = kwargs_default
39 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
40 |
41 | def use_custom_prompt(self, dataset):
42 | assert dataset is not None
43 | if listinstr(['MMMU'], dataset):
44 | return False
45 | if DATASET_TYPE(dataset) == 'multi-choice' or dataset == 'MMVet':
46 | return True
47 | return False
48 |
49 | def build_prompt(self, line, dataset=None):
50 | assert dataset is None or isinstance(dataset, str)
51 | assert self.use_custom_prompt(dataset)
52 | tgt_path = self.dump_image(line, dataset)
53 | question = line['question']
54 | if dataset == 'MMVet':
55 | prompt = question + '\nAnswer the question directly. '
56 | elif DATASET_TYPE(dataset) == 'multi-choice':
57 | options = {
58 | cand: line[cand]
59 | for cand in string.ascii_uppercase
60 | if cand in line and not pd.isna(line[cand])
61 | }
62 | options_prompt = ''
63 | for key, item in options.items():
64 | options_prompt += f'{key}. {item}\n'
65 |
66 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
67 | prompt = f'Hint: {hint}\n' if hint is not None else ''
68 | prompt += f'{question}\n'
69 | prompt += (
70 | f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
71 | if len(options) else 'Answer the question directly. '
72 | )
73 | else:
74 | raise NotImplementedError
75 |
76 | message = [dict(type='text', value=prompt)]
77 | message.extend([dict(type='image', value=s) for s in tgt_path])
78 | return message
79 |
80 | def generate_inner(self, message, dataset=None):
81 | from mplug_owl2.constants import IMAGE_TOKEN_INDEX
82 | from mplug_owl2.mm_utils import process_images, tokenizer_image_token
83 | kwargs = cp.deepcopy(self.kwargs)
84 | if dataset in ['MMVet', 'LLaVABench']:
85 | kwargs['length_penalty'] = 0
86 | elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
87 | kwargs['length_penalty'] = 0
88 | elif dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
89 | kwargs['max_new_tokens'] = 10
90 | num_images = len([x for x in message if x['type'] == 'image'])
91 | assert num_images >= 0
92 | prompt_full = 'USER: '
93 | images = []
94 | if num_images == 1:
95 | prompt, image = self.message_to_promptimg(message)
96 | prompt_full += f'<|image|>{prompt} \nASSISTANT: '
97 | images.append(image)
98 | else:
99 | for msg in message:
100 | if msg['type'] == 'image':
101 | images.append(msg['value'])
102 | prompt_full += '<|image|>'
103 | elif msg['type'] == 'text':
104 | prompt_full += msg['value']
105 | prompt_full += '\nASSISTANT: '
106 |
107 | def preproc_image(fname):
108 | image = Image.open(fname).convert('RGB')
109 | max_edge = max(image.size)
110 | image = image.resize((max_edge, max_edge))
111 | return image
112 | images = [preproc_image(fname) for fname in images]
113 | image_tensor = process_images(images, self.image_processor)
114 | image_tensor = image_tensor.to(self.device, dtype=torch.float16)
115 | input_ids = tokenizer_image_token(
116 | prompt_full, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
117 |
118 | with torch.inference_mode():
119 | output_ids = self.model.generate(
120 | input_ids=input_ids,
121 | images=image_tensor,
122 | output_hidden_states=True,
123 | use_cache=True,
124 | **kwargs)
125 | answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
126 | return answer.split('</s>')[0]
127 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/omnilmm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | from PIL import Image
4 | from transformers import AutoTokenizer
5 |
6 | from .base import BaseModel
7 | from ..smp import *
8 | from ..utils import DATASET_TYPE
9 |
10 |
11 | DEFAULT_IMAGE_TOKEN = '<image>'
12 | DEFAULT_IMAGE_PATCH_TOKEN = '<im_patch>'
13 | DEFAULT_IM_START_TOKEN = '<im_start>'
14 | DEFAULT_IM_END_TOKEN = '<im_end>'
15 |
16 |
17 | def init_omni_lmm(model_path):
18 | from omnilmm.model.omnilmm import OmniLMMForCausalLM
19 | from omnilmm.utils import disable_torch_init
20 | from omnilmm.model.utils import build_transform
21 |
22 | torch.backends.cuda.matmul.allow_tf32 = True
23 | disable_torch_init()
24 | tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=2048)
25 |
26 | model = OmniLMMForCausalLM.from_pretrained(model_path, tune_clip=True, torch_dtype=torch.bfloat16, device_map='cpu')
27 | model = model.to(device='cuda', dtype=torch.bfloat16)
28 |
29 | image_processor = build_transform(is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP')
30 |
31 | mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
32 | assert mm_use_im_start_end
33 |
34 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
35 |
36 | vision_config = model.model.vision_config
37 | vision_config.im_patch_token = tokenizer.convert_tokens_to_ids(
38 | [DEFAULT_IMAGE_PATCH_TOKEN])[0]
39 | vision_config.use_im_start_end = mm_use_im_start_end
40 | vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids(
41 | [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
42 | image_token_len = model.model.config.num_query
43 |
44 | return model, image_processor, image_token_len, tokenizer
45 |
46 |
47 | def expand_question_into_multimodal(question_text, image_token_len, im_st_token, im_ed_token, im_patch_token):
48 | if '<image>' in question_text[0]['content']:
49 | question_text[0]['content'] = question_text[0]['content'].replace(
50 | '<image>', im_st_token + im_patch_token * image_token_len + im_ed_token)
51 | else:
52 | question_text[0]['content'] = im_st_token + im_patch_token * \
53 | image_token_len + im_ed_token + '\n' + question_text[0]['content']
54 | return question_text
55 |
56 |
57 | def wrap_question_for_omni_lmm(question, image_token_len, tokenizer):
58 | from omnilmm.train.train_utils import omni_preprocess
59 | question = expand_question_into_multimodal(
60 | question, image_token_len, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN)
61 |
62 | conversation = question
63 | data_dict = omni_preprocess(sources=[conversation], tokenizer=tokenizer, generation=True)
64 |
65 | data_dict = dict(input_ids=data_dict['input_ids'][0], labels=data_dict['labels'][0])
66 | return data_dict
67 |
68 |
69 | class OmniLMM12B(BaseModel):
70 |
71 | INSTALL_REQ = True
72 | INTERLEAVE = False
73 |
74 | def __init__(self, model_path, root, **kwargs) -> None:
75 | sys.path.append(root)
76 | model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path)
77 | self.model = model
78 | self.image_token_len = image_token_len
79 | self.image_transform = img_processor
80 | self.tokenizer = tokenizer
81 | self.model.eval()
82 | default_kwargs = dict(
83 | max_new_tokens=512,
84 | do_sample=False,
85 | output_scores=True,
86 | return_dict_in_generate=True,
87 | repetition_penalty=1.1)
88 | default_kwargs.update(kwargs)
89 | self.kwargs = default_kwargs
90 | torch.cuda.empty_cache()
91 |
92 | def generate_inner(self, message, dataset=None):
93 | prompt, image_path = self.message_to_promptimg(message)
94 | try:
95 | image = Image.open(image_path).convert('RGB')
96 | except:
97 | logger = get_logger('OmniLMM Inference')
98 | logger.error('Image Decode Error')
99 | return 'Image Decode Error'
100 |
101 | msgs = [dict(role='user', content=prompt)]
102 | input_ids = wrap_question_for_omni_lmm(
103 | msgs, self.image_token_len, self.tokenizer)['input_ids']
104 | input_ids = torch.as_tensor(input_ids)
105 | image = self.image_transform(image)
106 |
107 | with torch.inference_mode():
108 | output = self.model.generate_vllm(
109 | input_ids=input_ids.unsqueeze(0).cuda(),
110 | images=image.unsqueeze(0).half().cuda(),
111 | **self.kwargs)
112 |
113 | response = self.tokenizer.decode(
114 | output.sequences[0], skip_special_tokens=True)
115 | response = response.strip()
116 | return response
117 |
118 | def use_custom_prompt(self, dataset):
119 | assert dataset is not None
120 | if DATASET_TYPE(dataset) == 'multi-choice':
121 | return True
122 | return False
123 |
124 | def build_prompt(self, line, dataset=None):
125 | assert dataset is None or isinstance(dataset, str)
126 | assert self.use_custom_prompt(dataset)
127 | tgt_path = self.dump_image(line, dataset)
128 |
129 | question = line['question']
130 | options = {
131 | cand: line[cand]
132 | for cand in string.ascii_uppercase
133 | if cand in line and not pd.isna(line[cand])
134 | }
135 | options_prompt = 'Options:\n'
136 | for key, item in options.items():
137 | options_prompt += f'{key}. {item}\n'
138 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
139 | prompt = ''
140 | if hint is not None:
141 | prompt += f'Hint: {hint}\n'
142 | prompt += f'{question}\n'
143 | if len(options):
144 | prompt += options_prompt
145 | prompt = """
146 | Study the image carefully and pick the option associated with the correct answer.
147 | Focus solely on selecting the option and avoid including any other content.\n
148 | """ + prompt
149 |
150 | message = [dict(type='text', value=prompt)]
151 | message.extend([dict(type='image', value=s) for s in tgt_path])
152 | return message
153 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/open_flamingo.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | from PIL import Image
4 | import os.path as osp
5 | import warnings
6 | from .base import BaseModel
7 | from ..smp import splitlen, get_cache_path
8 | from huggingface_hub import snapshot_download
9 |
10 |
11 | class OpenFlamingo(BaseModel):
12 |
13 | INSTALL_REQ = True
14 | INTERLEAVE = True
15 |
16 | def __init__(self,
17 | name,
18 | mpt_pth=None,
19 | ckpt_pth=None,
20 | **kwargs):
21 |
22 | if mpt_pth is None:
23 | warnings.warn(
24 | 'Please set `mpt_pth` to the directory of MPT-7B, which is cloned from here: '
25 | 'https://huggingface.co/mosaicml/mpt-7b. '
26 | )
27 | sys.exit(-1)
28 | if ckpt_pth is None:
29 | warnings.warn(
30 | 'Please set `ckpt_pth` to the openflamingo ckpt, which is the `checkpoint.pt` file downloaded '
31 | 'from: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b/tree/main. '
32 | )
33 | sys.exit(-1)
34 | else:
35 | if osp.exists(ckpt_pth):
36 | if ckpt_pth.endswith('checkpoint.pt'):
37 | pass
38 | elif osp.isdir(ckpt_pth):
39 | ckpt_pth = osp.join(ckpt_pth, 'checkpoint.pt')
40 | if not osp.exists(ckpt_pth):
41 | sys.exit(-1)
42 | elif splitlen(ckpt_pth, '/') == 2:
43 | cache_path = get_cache_path(ckpt_pth)
44 | if cache_path is None:
45 | snapshot_download(ckpt_pth)
46 | cache_path = get_cache_path(ckpt_pth)
47 | if cache_path is None:
48 | sys.exit(-1)
49 | else:
50 | ckpt_pth = osp.join(cache_path, 'checkpoint.pt')
51 |
52 | self.name = name
53 | assert name in ['v2']
54 | self.mpt_pth = mpt_pth
55 | try:
56 | from open_flamingo import create_model_and_transforms
57 | except:
58 | raise ImportError('Please first install open_flamingo to use OpenFlamingo')
59 | model, image_processor, tokenizer = create_model_and_transforms(
60 | clip_vision_encoder_path='ViT-L-14',
61 | clip_vision_encoder_pretrained='openai',
62 | lang_encoder_path=mpt_pth,
63 | tokenizer_path=mpt_pth,
64 | cross_attn_every_n_layers=4)
65 | ckpt = torch.load(ckpt_pth)
66 | model.load_state_dict(ckpt, strict=False)
67 | torch.cuda.empty_cache()
68 | self.model = model.eval().cuda()
69 | self.tokenizer = tokenizer
70 | self.tokenizer.padding_side = 'left'
71 | self.image_proc = image_processor
72 |
73 | kwargs_default = dict(max_new_tokens=512, num_beams=3)
74 | kwargs_default.update(kwargs)
75 | self.kwargs = kwargs_default
76 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
77 |
78 | def generate_inner(self, message, dataset=None):
79 | vision_x = []
80 | prompt = ''
81 | for msg in message:
82 | if msg['type'] == 'image':
83 | img = Image.open(msg['value'])
84 | vision_x.append(self.image_proc(img).unsqueeze(0))
85 |                 prompt += '<image>'
86 | elif msg['type'] == 'text':
87 | prompt += msg['value']
88 | prompt += 'Answer: '
89 | vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0]
90 | vision_x = vision_x.unsqueeze(1).unsqueeze(0)
91 | lang_x = self.tokenizer([prompt], return_tensors='pt')
92 | generated_text = self.model.generate(
93 | vision_x=vision_x.cuda(),
94 | lang_x=lang_x['input_ids'].cuda(),
95 | attention_mask=lang_x['attention_mask'].cuda(),
96 | **self.kwargs)
97 | generated_text = self.tokenizer.decode(generated_text[0])
98 | text = generated_text[len(prompt):].split('<|endofchunk|>')[0]
99 | return text
100 |
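
A note on the shared input format: `generate_inner` here (and in the wrappers that follow) consumes a `message` list of dicts, each with a `type` of `'text'` or `'image'` and a `value` holding either the prompt text or an image path. Below is a minimal sketch of driving the wrapper directly; the checkpoint locations are placeholders, not real defaults.

```python
# Hypothetical local paths; as the constructor warnings above explain,
# OpenFlamingo needs a local MPT-7B clone and the OpenFlamingo checkpoint.
model = OpenFlamingo('v2', mpt_pth='/path/to/mpt-7b', ckpt_pth='/path/to/checkpoint.pt')

message = [
    dict(type='image', value='demo.jpg'),                    # image entries carry file paths
    dict(type='text', value='What is shown in the image?'),  # text entries carry the prompt
]
print(model.generate_inner(message))
```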
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/paligemma.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import torch
3 |
4 | from .base import BaseModel
5 | from ..smp import *
6 |
7 |
8 | class PaliGemma(BaseModel):
9 | INSTALL_REQ = False
10 | INTERLEAVE = False
11 |
12 | def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs):
13 | try:
14 | from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
15 |         except ImportError:
16 |             warnings.warn('Please install the latest version of transformers.')
17 | sys.exit(-1)
18 | model = PaliGemmaForConditionalGeneration.from_pretrained(
19 | model_path,
20 | torch_dtype=torch.bfloat16,
21 | device_map='cpu',
22 | revision='bfloat16',
23 | ).eval()
24 | self.model = model.cuda()
25 | self.processor = AutoProcessor.from_pretrained(model_path)
26 | self.kwargs = kwargs
27 |
28 | def generate_inner(self, message, dataset=None):
29 | prompt, image_path = self.message_to_promptimg(message)
30 | image = Image.open(image_path).convert('RGB')
31 |
32 | model_inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda')
33 | input_len = model_inputs['input_ids'].shape[-1]
34 |
35 | with torch.inference_mode():
36 | generation = self.model.generate(**model_inputs, max_new_tokens=512, do_sample=False)
37 | generation = generation[0][input_len:]
38 | res = self.processor.decode(generation, skip_special_tokens=True)
39 | return res
40 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/pandagpt.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import torch
3 | import os.path as osp
4 | import warnings
5 | from .base import BaseModel
6 |
7 |
8 | class PandaGPT(BaseModel):
9 |
10 | INSTALL_REQ = True
11 | INTERLEAVE = False
12 |
13 | def __init__(self, name, root=None, **kwargs):
14 | if root is None:
15 |             warnings.warn('Please set `root` to the PandaGPT code directory, which is cloned from here: https://github.com/yxuansu/PandaGPT. ')
16 | sys.exit(-1)
17 |
18 | assert name == 'PandaGPT_13B'
19 | self.name = name
20 | sys.path.append(osp.join(root, 'code'))
21 | try:
22 | from model.openllama import OpenLLAMAPEFTModel
23 |         except ImportError:
24 | raise ImportError(
25 | 'Please first install PandaGPT and set the root path to use PandaGPT, '
26 | 'which is cloned from here: https://github.com/yxuansu/PandaGPT. '
27 | )
28 | self.args = {
29 | 'model': 'openllama_peft',
30 | 'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'),
31 | 'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'),
32 | 'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'),
33 | 'stage': 2,
34 | 'max_tgt_len': 512,
35 | 'lora_r': 32,
36 | 'lora_alpha': 32,
37 | 'lora_dropout': 0.1,
38 | }
39 | model = OpenLLAMAPEFTModel(**self.args)
40 | delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu'))
41 | model.load_state_dict(delta_ckpt, strict=False)
42 | torch.cuda.empty_cache()
43 | self.model = model.eval().half().cuda()
44 | kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001}
45 | kwargs_default.update(kwargs)
46 | self.kwargs = kwargs_default
47 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
48 |
49 | def generate_inner(self, message, dataset=None):
50 | prompt, image_path = self.message_to_promptimg(message)
51 | struct = {
52 | 'prompt': prompt,
53 | 'image_paths': [image_path],
54 | 'audio_paths': [],
55 | 'video_paths': [],
56 | 'thermal_paths': [],
57 | 'modality_embeds': []
58 | }
59 | struct.update(self.kwargs)
60 | resp = self.model.generate(struct)
61 | return resp
62 |
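
The single-image wrappers in this module (those with `INTERLEAVE = False`, PandaGPT included) lean on `BaseModel.message_to_promptimg`, which is defined in `vlmeval/vlm/base.py` and not reproduced in this section. A rough sketch of what such a helper plausibly does, assuming it merges the text pieces and keeps one image path:

```python
def message_to_promptimg_sketch(message):
    # Rough stand-in for the base-class helper (the real implementation lives
    # in vlmeval/vlm/base.py): collapse all text entries into one prompt and
    # keep a single image path for models that accept only one image.
    prompt = '\n'.join(m['value'] for m in message if m['type'] == 'text')
    images = [m['value'] for m in message if m['type'] == 'image']
    image_path = images[0] if images else None
    return prompt, image_path
```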
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/phi3_vision.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import torch
3 |
4 | from .base import BaseModel
5 | from ..smp import *
6 |
7 |
8 | class Phi3Vision(BaseModel):
9 |
10 | INSTALL_REQ = False
11 | INTERLEAVE = False
12 |
13 | def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs):
14 | try:
15 | from transformers import AutoProcessor, AutoModelForCausalLM
16 |         except ImportError:
17 |             warnings.warn('Please install the latest version of transformers.')
18 | sys.exit(-1)
19 | model = AutoModelForCausalLM.from_pretrained(
20 | model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto').eval()
21 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
22 | self.model = model
23 | self.processor = processor
24 | self.kwargs = kwargs
25 |
26 | def generate_inner(self, message, dataset=None):
27 | prompt, image_path = self.message_to_promptimg(message)
28 | image = Image.open(image_path).convert('RGB')
29 | messages = [
30 | {'role': 'user', 'content': f'<|image_1|>\n{prompt}'}
31 | ]
32 | prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
33 | inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda')
34 |
35 | generation_args = {
36 | 'max_new_tokens': 500,
37 | 'temperature': 0.0,
38 | 'do_sample': False,
39 | }
40 | generation_args.update(self.kwargs)
41 |
42 | generate_ids = self.model.generate(
43 | **inputs,
44 | eos_token_id=self.processor.tokenizer.eos_token_id,
45 | **generation_args
46 | )
47 | generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
48 | response = self.processor.batch_decode(
49 | generate_ids,
50 | skip_special_tokens=True,
51 | clean_up_tokenization_spaces=False
52 | )[0]
53 | return response
54 |
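
Because `generate_inner` calls `generation_args.update(self.kwargs)`, any keyword passed to the constructor overrides the per-call defaults (`max_new_tokens=500`, greedy decoding). A small usage sketch; the override value and image path are illustrative only:

```python
# Shorter greedy answers: constructor kwargs end up in self.kwargs and
# overwrite the defaults inside generate_inner.
model = Phi3Vision(max_new_tokens=64)
message = [dict(type='image', value='demo.jpg'),
           dict(type='text', value='What is written on the sign?')]
print(model.generate_inner(message))
```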
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/qh_360vl.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 | import warnings
4 | import os.path as osp
5 | from PIL import Image
6 | from .base import BaseModel
7 | from ..smp import *
8 | from ..utils import DATASET_TYPE
9 |
10 |
11 | class QH_360VL(BaseModel):
12 |
13 | INSTALL_REQ = False
14 | INTERLEAVE = False
15 |
16 | def __init__(self, model_path='qihoo360/360VL-70B', **kwargs):
17 | assert model_path is not None
18 | self.model_path = model_path
19 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
20 | self.model = AutoModelForCausalLM.from_pretrained(model_path,
21 | torch_dtype=torch.float16,
22 | low_cpu_mem_usage=True,
23 | device_map='auto',
24 | trust_remote_code=True).eval()
25 | vision_tower = self.model.get_vision_tower()
26 | vision_tower.load_model()
27 | vision_tower.to(device='cuda', dtype=torch.float16)
28 | self.image_processor = vision_tower.image_processor
29 | self.tokenizer.pad_token = self.tokenizer.eos_token
30 | self.kwargs = kwargs
31 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
32 | torch.cuda.empty_cache()
33 |
34 | def generate(self, message, dataset=None):
35 |
36 | prompt, image_path = self.message_to_promptimg(message)
37 | print(prompt)
38 | image = Image.open(image_path).convert('RGB')
39 | terminators = [
40 |             self.tokenizer.convert_tokens_to_ids('<|eot_id|>')
41 | ]
42 | inputs = self.model.build_conversation_input_ids(self.tokenizer,
43 | query=prompt,
44 | image=image,
45 | image_processor=self.image_processor)
46 | input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True)
47 | images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True)
48 |
49 | output_ids = self.model.generate(input_ids=input_ids,
50 | images=images,
51 | do_sample=False,
52 | num_beams=1,
53 | max_new_tokens=512,
54 | eos_token_id=terminators,
55 | use_cache=True)
56 |
57 | input_token_len = input_ids.shape[1]
58 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
59 | response = outputs.strip()
60 |
61 | return response
62 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/qwen_vl.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 | import warnings
4 | import copy as cp
5 | from .base import BaseModel
6 | from ..smp import isimg, listinstr
7 | from ..utils import DATASET_TYPE
8 |
9 |
10 | class QwenVL(BaseModel):
11 |
12 | INSTALL_REQ = False
13 | INTERLEAVE = True
14 |
15 | def __init__(self, model_path='Qwen/Qwen-VL', **kwargs):
16 | assert model_path is not None
17 | self.model_path = model_path
18 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
19 | tokenizer.padding_side = 'left'
20 | tokenizer.pad_token_id = tokenizer.eod_id
21 | self.tokenizer = tokenizer
22 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval()
23 | default_kwargs = dict(
24 | do_sample=False,
25 | num_beams=1,
26 | max_new_tokens=512,
27 | min_new_tokens=1,
28 | num_return_sequences=1,
29 | use_cache=True,
30 | output_hidden_states=True,
31 | pad_token_id=tokenizer.eod_id,
32 | eos_token_id=tokenizer.eod_id)
33 | default_kwargs.update(kwargs)
34 | self.kwargs = default_kwargs
35 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
36 | torch.cuda.empty_cache()
37 |
38 | def adjust_kwargs(self, dataset):
39 | kwargs = cp.deepcopy(self.kwargs)
40 | if DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']:
41 | kwargs['max_new_tokens'] = 32
42 | elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset:
43 | kwargs['max_new_tokens'] = 32
44 | elif DATASET_TYPE(dataset) == 'VQA':
45 | if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset):
46 | kwargs['max_new_tokens'] = 100
47 | elif listinstr(['TextVQA'], dataset):
48 | kwargs['max_new_tokens'] = 10
49 | return kwargs
50 |
51 | def generate_inner(self, message, dataset=None):
52 | if dataset is not None:
53 | kwargs = self.adjust_kwargs(dataset)
54 | else:
55 | kwargs = self.kwargs
56 | prompt = ''
57 | for s in message:
58 | if s['type'] == 'image':
59 |                 prompt += f'<img>{s["value"]}</img>'
60 | elif s['type'] == 'text':
61 | prompt += s['value']
62 | if dataset is not None and DATASET_TYPE(dataset) == 'VQA':
63 | prompt += ' Answer:'
64 | encoded = self.tokenizer([prompt], return_tensors='pt', padding='longest')
65 | input_ids = encoded.input_ids.to('cuda')
66 | attention_mask = encoded.attention_mask.to('cuda')
67 |
68 | pred = self.model.generate(
69 | input_ids=input_ids,
70 | attention_mask=attention_mask,
71 | **kwargs)
72 | answer = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
73 | return answer
74 |
75 |
76 | class QwenVLChat(BaseModel):
77 |
78 | INSTALL_REQ = False
79 | INTERLEAVE = True
80 |
81 | def __init__(self, model_path='Qwen/Qwen-VL-Chat', **kwargs):
82 | assert model_path is not None
83 | self.model_path = model_path
84 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
85 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval()
86 | torch.cuda.empty_cache()
87 | self.kwargs = kwargs
88 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
89 |
90 | def generate_inner(self, message, dataset=None):
91 | vl_list = [{'image': s['value']} if s['type'] == 'image' else {'text': s['value']} for s in message]
92 | query = self.tokenizer.from_list_format(vl_list)
93 | response, _ = self.model.chat(self.tokenizer, query=query, history=None, **self.kwargs)
94 | return response
95 |
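
For the base (non-chat) Qwen-VL, `generate_inner` builds an interleaved plain-text prompt in which each image is referenced through `<img>...</img>` tags wrapping its path, and VQA-style datasets get an extra `' Answer:'` suffix; the chat variant instead relies on `tokenizer.from_list_format`. A small illustration of the assembled prompt, assuming the `<img>` tag convention used above (the file name is made up):

```python
message = [
    dict(type='image', value='demo.jpg'),
    dict(type='text', value='Describe the chart.'),
]
# QwenVL.generate_inner assembles:
#   '<img>demo.jpg</img>Describe the chart.'
# and appends ' Answer:' when DATASET_TYPE(dataset) == 'VQA'.
prompt = ''.join(f'<img>{m["value"]}</img>' if m['type'] == 'image' else m['value']
                 for m in message)
print(prompt)
```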
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/visualglm.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from .base import BaseModel
3 | from ..smp import *
4 |
5 |
6 | class VisualGLM(BaseModel):
7 |
8 | INSTALL_REQ = False
9 | INTERLEAVE = False
10 |
11 | def __init__(self, model_path='THUDM/visualglm-6b', **kwargs):
12 | try:
13 | import sat
14 |         except ImportError:
15 | warnings.warn('Please install SwissArmyTransformer to use VisualGLM')
16 | assert model_path is not None
17 | self.model_path = model_path
18 |
19 | from transformers import AutoModel
20 | from transformers import AutoTokenizer
21 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
22 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
23 | self.model = model
24 | self.kwargs = kwargs
25 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
26 |
27 | def generate_inner(self, message, dataset=None):
28 | prompt, image_path = self.message_to_promptimg(message)
29 | output, _ = self.model.chat(
30 | image_path=image_path,
31 | tokenizer=self.tokenizer,
32 | query=prompt,
33 | history=[],
34 | **self.kwargs
35 | )
36 | return output
37 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/vxverse.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | import os.path as osp
4 | import warnings
5 | from .base import BaseModel
6 | from transformers import StoppingCriteriaList
7 | from omegaconf import OmegaConf
8 | from PIL import Image
9 | from huggingface_hub import snapshot_download
10 | from vlmeval.smp import *
11 |
12 | model_cfgs = {
13 | 'XVERSE-V-13B': {
14 | 'arch': 'vxverse',
15 | 'model_type': 'pretrain_xverse13b-chat',
16 | 'max_txt_len': 512,
17 | 'end_sym': '<|endoftext|>',
18 | 'low_resource': False,
19 | 'prompt_template': 'Human: {}\nAssistant: ',
20 | 'ckpt': 'xverse/XVERSE-V-13B',
21 | 'lora_r': 128,
22 | 'lora_alpha': 256,
23 | 'lora_dropout': 0.05,
24 | 'lora_target_modules': 'all_linear',
25 | 'has_qformer': False,
26 | 'n_proj_layers': 2,
27 | 'vit_model': 'openai/clip-vit-large-patch14',
28 | 'vit_path': 'openai/clip-vit-large-patch14',
29 | 'image_size': 224,
30 | 'drop_path_rate': 0,
31 | 'vit_precision': 'fp16',
32 | 'llama_model': 'xverse/XVERSE-13B-Chat',
33 | }
34 | }
35 |
36 |
37 | class VXVERSE(BaseModel):
38 |
39 | INSTALL_REQ = True
40 | INTERLEAVE = False
41 |
42 | def __init__(self, model_name='XVERSE-V-13B', root=None, **kwargs):
43 |
44 | if root is None:
45 | warnings.warn('Please set root to the directory of vxverse.')
46 |
47 | if model_name == 'XVERSE-V-13B':
48 | cfg = model_cfgs['XVERSE-V-13B']
49 | else:
50 | raise NotImplementedError
51 |
52 | ckpt_dir = cfg['ckpt']
53 | if not osp.isdir(ckpt_dir):
54 | cache_path = get_cache_path(ckpt_dir)
55 | if cache_path is not None:
56 | ckpt_dir = cache_path
57 | else:
58 | ckpt_dir = snapshot_download(repo_id=ckpt_dir)
59 | assert osp.exists(ckpt_dir) and osp.isdir(ckpt_dir)
60 | ckpt = osp.join(ckpt_dir, 'adapter_and_lora.bin')
61 | cfg['ckpt'] = ckpt
62 | model_cfg = OmegaConf.create(cfg)
63 |
64 | self.model_name = model_name
65 |
66 | self.root = root
67 | sys.path.append(self.root)
68 |
69 | from vxverse.common.registry import registry
70 | from vxverse.conversation.conversation import CONV_VISION_XVERSE
71 |
72 | device = torch.cuda.current_device()
73 | self.device = device
74 |
75 | model_cls = registry.get_model_class(model_cfg.arch)
76 | model = model_cls.from_config(model_cfg)
77 | model = model.to(device)
78 | model.eval()
79 | vis_processor_cfg = OmegaConf.create(dict(name='hd_image_train', image_size=224))
80 | vis_processor = registry.get_processor_class(
81 | vis_processor_cfg.name
82 | ).from_config(vis_processor_cfg)
83 |
84 | self.model = model
85 | self.vis_processor = vis_processor
86 | self.vis_processor_cfg = vis_processor_cfg
87 |
88 | self.CONV_VISION = CONV_VISION_XVERSE
89 | self.CONV_VISION.system = ''
90 | stop_words_ids = [[835], [2277, 29937]]
91 | self.stop_words_ids = stop_words_ids
92 | default_kwargs = dict(max_new_tokens=512)
93 | default_kwargs.update(kwargs)
94 | self.kwargs = default_kwargs
95 |
96 | def generate_inner(self, message, dataset=None):
97 | prompt, image_path = self.message_to_promptimg(message)
98 |
99 | image = Image.open(image_path).convert('RGB')
100 | image = self.vis_processor(image)
101 |
102 | if self.vis_processor_cfg.name == 'hd_image_train':
103 | patches_per_image = [[image.shape[0]]]
104 | image = [image]
105 | else:
106 | patches_per_image = None
107 | image = image.unsqueeze(0)
108 |
109 | chat_state = self.CONV_VISION.copy()
110 | texts = self.prepare_texts([prompt], chat_state)
111 | texts = [text.lstrip() for text in texts]
112 | answers = self.model.generate(
113 | image,
114 | texts,
115 | patches_per_images=patches_per_image,
116 | do_sample=False,
117 | stop_words_ids=self.stop_words_ids,
118 | **self.kwargs
119 | )
120 | return answers[0]
121 |
122 | def prepare_texts(self, texts, conv_temp):
123 | convs = [conv_temp.copy() for _ in range(len(texts))]
124 | [
125 | conv.append_message(conv.roles[0], '\n{}'.format(text))
126 | for conv, text in zip(convs, texts)
127 | ]
128 | [conv.append_message(conv.roles[1], None) for conv in convs]
129 | texts = [conv.get_prompt() for conv in convs]
130 | return texts
131 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/wemm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from PIL import Image
3 | import sys
4 | from ..smp import *
5 | from .base import BaseModel
6 | from ..utils import DATASET_TYPE
7 | from transformers import AutoModel, GenerationConfig
8 |
9 |
10 | class WeMM(BaseModel):
11 | def __init__(self, model_path='feipengma/WeMM', **kwargs):
12 | self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
13 | self.wemm.cuda()
14 | self.wemm.eval()
15 | torch.cuda.empty_cache()
16 |
17 | def use_custom_prompt(self, dataset):
18 | assert dataset is not None
19 | if DATASET_TYPE(dataset) == 'multi-choice':
20 | return True
21 | return False
22 |
23 | def build_prompt(self, line, dataset=None):
24 | assert self.use_custom_prompt(dataset)
25 | assert dataset is None or isinstance(dataset, str)
26 | tgt_path = self.dump_image(line, dataset)
27 | question = line['question']
28 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
29 | if hint is not None:
30 | question = hint + '\n' + question
31 | options = {
32 | cand: line[cand]
33 | for cand in string.ascii_uppercase
34 | if cand in line and not pd.isna(line[cand])
35 | }
36 | for key, item in options.items():
37 | question += f'\n{key}. {item}'
38 | prompt = question
39 |
40 | if len(options):
41 | prompt += (
42 | '\n请直接回答选项字母。' if cn_string(prompt) else
43 | "\nAnswer with the option's letter from the given choices directly."
44 | )
45 | else:
46 | prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
47 |
48 | message = [dict(type='text', value=prompt)]
49 | message.extend([dict(type='image', value=p) for p in tgt_path])
50 | return message
51 |
52 | def generate_inner(self, message, dataset=None):
53 | prompt, image_path = self.message_to_promptimg(message)
54 |
55 | if dataset == 'HallusionBench':
56 | prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.'
57 |
58 | gen_config = None
59 | if dataset == 'MMVet':
60 | gen_config = GenerationConfig(
61 | max_new_tokens=512,
62 | do_sample=True,
63 |                 temperature=0.7,
64 | num_beams=3,
65 | eos_token_id=self.wemm.tokenizer.eos_token_id,
66 | pad_token_id=self.wemm.tokenizer.pad_token_id
67 | if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id,
68 | )
69 | pred = self.wemm.mm_generate(image_path, prompt, gen_config)
70 |
71 | return pred
72 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/xcomposer/__init__.py:
--------------------------------------------------------------------------------
1 | from .sharecaptioner import ShareCaptioner
2 | from .xcomposer import XComposer
3 | from .xcomposer2 import XComposer2
4 | from .xcomposer2_4KHD import XComposer2_4KHD
5 |
6 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD']
7 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/sharecaptioner.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/sharecaptioner.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer2.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer2.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer2_4KHD.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer2_4KHD.cpython-39.pyc
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/xcomposer/sharecaptioner.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer
3 | from ..base import BaseModel
4 | from ...smp import *
5 | from ...utils import DATASET_TYPE
6 |
7 |
8 | class ShareCaptioner(BaseModel):
9 |
10 | INSTALL_REQ = False
11 | INTERLEAVE = False
12 |
13 | def __init__(self, model_path='Lin-Chen/ShareCaptioner', **kwargs):
14 | assert model_path is not None
15 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
16 | self.model = AutoModelForCausalLM.from_pretrained(
17 | model_path, device_map='cuda', trust_remote_code=True).eval()
18 | self.model.tokenizer = tokenizer
19 | self.model.cuda()
20 | self.model.half()
21 |
22 | def use_custom_prompt(self, dataset):
23 | assert dataset is not None
24 | if DATASET_TYPE(dataset) == 'multi-choice':
25 | return True
26 | return False
27 |
28 | def build_prompt(self, line, dataset=None):
29 | assert dataset is None or isinstance(dataset, str)
30 | assert self.use_custom_prompt(dataset)
31 | tgt_path = self.dump_image(line, dataset)
32 |
33 | if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
34 | question = line['question']
35 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
36 | if hint is not None:
37 | question = hint + '\n' + question
38 |
39 | option_candidate = string.ascii_uppercase
40 | options = {
41 | cand: line[cand]
42 | for cand in option_candidate
43 | if cand in line and not pd.isna(line[cand])
44 | }
45 | for key, item in options.items():
46 | question += f'\n{key}. {item}'
47 | prompt = question
48 |
49 | if not cn_string(prompt):
50 | prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly."
51 | else:
52 | prompt = prompt + '\n' + '请直接回答选项字母。'
53 | else:
54 | prompt = line['question']
55 | message = [dict(type='text', value=prompt)]
56 | message.extend([dict(type='image', value=s) for s in tgt_path])
57 | return message
58 |
59 | def generate_inner(self, message, dataset=None):
60 | prompt, image_path = self.message_to_promptimg(message)
61 | seg1 = '<|User|>:'
62 | seg2 = f'{prompt}{self.model.eoh}\n<|Bot|>:'
63 | self.seg_emb1 = self.model.encode_text(seg1, add_special_tokens=True)
64 | self.seg_emb2 = self.model.encode_text(seg2, add_special_tokens=False)
65 |
66 | image = Image.open(image_path).convert('RGB')
67 | image = self.model.vis_processor(image).unsqueeze(0)
68 | image = image.to(self.model.device)
69 | tmp_bs = image.shape[0]
70 | tmp_seg_emb1 = self.seg_emb1.repeat(tmp_bs, 1, 1)
71 | tmp_seg_emb2 = self.seg_emb2.repeat(tmp_bs, 1, 1)
72 | with torch.cuda.amp.autocast():
73 | with torch.no_grad():
74 | image = self.model.encode_img(image)
75 | input_emb = torch.cat(
76 | [tmp_seg_emb1, image, tmp_seg_emb2], dim=1)
77 | out_embeds = self.model.internlm_model.generate(
78 | inputs_embeds=input_emb,
79 | max_length=500,
80 | num_beams=3,
81 | min_length=1,
82 | do_sample=True,
83 | repetition_penalty=1.5,
84 | length_penalty=1.0,
85 | temperature=1.,
86 | eos_token_id=self.model.tokenizer.eos_token_id,
87 | num_return_sequences=1)
88 |
89 | for j, out in enumerate(out_embeds):
90 | out[out == -1] = 2
91 | response = self.model.decode_text([out])
92 | return response
93 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/xcomposer/xcomposer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModel, AutoTokenizer
3 | from transformers import StoppingCriteria, StoppingCriteriaList
4 | from PIL import Image
5 | from ..base import BaseModel
6 | from ...smp import *
7 |
8 |
9 | class StoppingCriteriaSub(StoppingCriteria):
10 | def __init__(self, stops=[], encounters=1):
11 | super().__init__()
12 | self.stops = stops
13 |
14 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
15 | for stop in self.stops:
16 | if torch.all((stop == input_ids[0][-len(stop):])).item():
17 | return True
18 |
19 | return False
20 |
21 |
22 | from ...utils import DATASET_TYPE
23 |
24 |
25 | class XComposer(BaseModel):
26 |
27 | INSTALL_REQ = False
28 | INTERLEAVE = False
29 |
30 | def __init__(self, model_path='internlm/internlm-xcomposer-vl-7b', **kwargs):
31 | assert model_path is not None
32 | self.model_path = model_path
33 |
34 | model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
35 | tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
36 | model.tokenizer = tokenizer
37 | self.model = model
38 | self.device = self.model.internlm_model.model.embed_tokens.weight.device
39 | self.eoh = ''
40 | self.eoa = ''
41 | stop_words_ids = [
42 | torch.tensor([103027]).to(self.device), # end of human
43 | torch.tensor([103028]).to(self.device), # end of bot
44 | ]
45 | default_kwargs = {
46 | 'max_new_tokens': 512, 'num_beams': 5, 'do_sample': False,
47 | 'min_length': 1, 'repetition_penalty': 1.5, 'length_penalty': 1.0
48 | }
49 | default_kwargs.update(kwargs)
50 | self.kwargs = default_kwargs
51 | self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
52 |
53 | def generate_inner(self, message, dataset=None):
54 | if len(message) == 2:
55 | if message[0]['type'] == 'text' and message[1]['type'] == 'image':
56 | message = [message[1], message[0]]
57 | kwargs = cp.deepcopy(self.kwargs)
58 | if dataset is not None:
59 | if DATASET_TYPE(dataset) == 'multi-choice':
60 | kwargs['max_new_tokens'] = 5
61 | kwargs['num_beams'] = 5
62 |
63 | with torch.cuda.amp.autocast():
64 | with torch.no_grad():
65 | prompt_embs = self.message_to_prompt_embs(message, dataset)
66 | outputs = self.model.internlm_model.generate(
67 | inputs_embeds=prompt_embs,
68 | stopping_criteria=self.stopping_criteria,
69 | **kwargs
70 | )
71 |
72 | output_token = outputs[0]
73 | if output_token[0] == 0:
74 | output_token = output_token[1:]
75 | if output_token[0] == 1:
76 | output_token = output_token[1:]
77 | output_text = self.model.tokenizer.decode(output_token, add_special_tokens=False)
78 |
79 | output_text = output_text.split(self.model.eoa)[0]
80 | output_text = output_text.split('<|Bot|>')[-1].strip()
81 | return output_text
82 |
83 | def message_to_prompt_embs(self, message, dataset=None):
84 | assert isinstance(message, list)
85 | img_embeds = []
86 | prompt_full = '<|User|>: '
87 | for msg in message:
88 | if msg['type'] == 'text':
89 | prompt_full += msg['value']
90 | elif msg['type'] == 'image':
91 | image = Image.open(msg['value']).convert('RGB')
92 | image = self.model.vis_processor(image).unsqueeze(0).to(self.device)
93 | img_embeds.append(self.model.encode_img(image))
94 |                 prompt_full += '<ImageHere>'
95 |
96 | prompt_full += self.model.eoh + ' <|Bot|>: '
97 | if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
98 | prompt_full += 'Answer: The answer is '
99 | elif dataset is not None and DATASET_TYPE(dataset) in ['VQA', 'QA', 'Y/N']:
100 | prompt_full += 'Answer: '
101 |
102 |         prompt_segs = prompt_full.split('<ImageHere>')
103 | assert len(prompt_segs) == len(img_embeds) + 1
104 |
105 | prompt_seg_tokens = [
106 | self.model.tokenizer(seg, return_tensors='pt', add_special_tokens=(i == 0)).to(self.device).input_ids.long()
107 | for i, seg in enumerate(prompt_segs)
108 | ]
109 | prompt_seg_embs = [self.model.internlm_model.model.embed_tokens(seg) for seg in prompt_seg_tokens]
110 | all_embeddings = []
111 | for i in range(len(img_embeds)):
112 | all_embeddings.extend([prompt_seg_embs[i], img_embeds[i]])
113 | all_embeddings.append(prompt_seg_embs[-1])
114 | prompt_embs = torch.cat(all_embeddings, dim=1)
115 | return prompt_embs
116 |
117 | def use_custom_prompt(self, dataset):
118 | assert dataset is not None
119 | if DATASET_TYPE(dataset) == 'multi-choice':
120 | return True
121 | return False
122 |
123 | def build_prompt(self, line, dataset=None):
124 | assert dataset is None or isinstance(dataset, str)
125 | assert self.use_custom_prompt(dataset)
126 | tgt_path = self.dump_image(line, dataset)
127 |
128 | question = line['question']
129 | options = {
130 | cand: line[cand]
131 | for cand in string.ascii_uppercase
132 | if cand in line and not pd.isna(line[cand])
133 | }
134 | options_prompt = ''
135 | for key, item in options.items():
136 | options_prompt += f'{key}. {item}\n'
137 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
138 | context = 'N/A' if hint is None else hint
139 | mid_prompt = 'Context: ' + context + '\nQuestion: ' + question
140 | if len(options_prompt):
141 | mid_prompt += '\nOptions: ' + options_prompt
142 |
143 | if len(options):
144 | txt_prompt = 'Please answer this question by choosing the correct choice.'
145 | else:
146 | txt_prompt = 'Please answer this question directly. '
147 | prompt = txt_prompt + mid_prompt
148 | message = [dict(type='text', value=prompt)]
149 | message.extend([dict(type='image', value=s) for s in tgt_path])
150 | return message
151 |
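
The multi-choice prompt assembled by `build_prompt` above has a fixed shape: an instruction line, a `Context:` field (`N/A` when there is no hint), the question, and the lettered options. Below is a small sketch that mirrors the text-assembly part only; image handling via `dump_image` is left out, and the sample line is made up.

```python
import string
import pandas as pd

def sketch_mc_prompt(line):
    # Mirrors the string construction in XComposer.build_prompt above.
    question = line['question']
    options = {c: line[c] for c in string.ascii_uppercase
               if c in line and not pd.isna(line[c])}
    options_prompt = ''.join(f'{k}. {v}\n' for k, v in options.items())
    hint = line.get('hint')
    context = 'N/A' if hint is None or pd.isna(hint) else hint
    mid_prompt = f'Context: {context}\nQuestion: {question}'
    if options_prompt:
        mid_prompt += '\nOptions: ' + options_prompt
    head = ('Please answer this question by choosing the correct choice.'
            if options else 'Please answer this question directly. ')
    return head + mid_prompt

print(sketch_mc_prompt(dict(question='What animal is shown?',
                            A='cat', B='dog', C='rabbit', D='horse')))
```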
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/yi_vl.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import sys
3 | import os.path as osp
4 | import warnings
5 | from PIL import Image
6 | from vlmeval.smp import get_cache_path, load, dump, splitlen
7 | from huggingface_hub import snapshot_download
8 | from .base import BaseModel
9 |
10 |
11 | """
12 | You can run inference with Yi-VL through the following steps:
13 | 1. clone the repo https://github.com/01-ai/Yi to path-to-Yi
14 | 2. set up the environment and install the required packages in path-to-Yi/VL/requirements.txt
15 | 3. set Yi_ROOT in vlmeval/config.py
16 | Yi_ROOT = path-to-Yi
17 |
18 | You are all set now! To run a demo for Yi-VL:
19 | ```python
20 | from vlmeval import *
21 | model = supported_VLM['Yi_VL_6B']()
22 | model.generate('apple.jpg', 'What is in this image?')
23 | ```
24 | To run evaluation for Yi-VL, use `python run.py --model Yi_VL_6B --data {dataset_list}`
25 | """
26 |
27 |
28 | def edit_config(repo_id):
29 | if not osp.exists(repo_id):
30 | root = get_cache_path(repo_id)
31 | else:
32 | root = repo_id
33 | assert root is not None and osp.exists(root)
34 | cfg = osp.join(root, 'config.json')
35 | data = load(cfg)
36 | mm_vision_tower = data['mm_vision_tower']
37 | if mm_vision_tower.startswith('./vit/'):
38 | data['mm_vision_tower'] = osp.join(root, mm_vision_tower)
39 | assert osp.exists(data['mm_vision_tower'])
40 | dump(data, cfg)
41 |
42 |
43 | def disable_torch_init():
44 | """
45 | Disable the redundant torch default initialization to accelerate model creation.
46 | """
47 | import torch
48 |
49 | setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)
50 | setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)
51 |
52 |
53 | class Yi_VL(BaseModel):
54 |
55 | INSTALL_REQ = True
56 | INTERLEAVE = False
57 |
58 | def __init__(self,
59 | model_path='01-ai/Yi-VL-6B',
60 | root=None,
61 | **kwargs):
62 |
63 | if root is None:
64 | warnings.warn(
65 | 'Please set root to the directory of Yi, '
66 | 'which is cloned from here: https://github.com/01-ai/Yi.'
67 | )
68 |
69 | self.root = osp.join(root, 'VL')
70 | sys.path.append(self.root)
71 |
72 | if splitlen(model_path, '/') == 2 and not osp.exists(model_path):
73 | if get_cache_path(model_path) is None:
74 | snapshot_download(repo_id=model_path)
75 | edit_config(model_path)
76 | elif osp.exists(model_path):
77 | edit_config(model_path)
78 |
79 | from llava.mm_utils import get_model_name_from_path, load_pretrained_model
80 | from llava.model.constants import key_info
81 |
82 | disable_torch_init()
83 | key_info['model_path'] = model_path
84 | get_model_name_from_path(model_path)
85 | self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
86 | model_path,
87 | device_map='cpu')
88 | self.model = self.model.cuda()
89 | self.conv_mode = 'mm_default'
90 |
91 | kwargs_default = dict(temperature=0.2,
92 | num_beams=1,
93 | do_sample=False,
94 | max_new_tokens=1024,
95 | top_p=None)
96 | kwargs_default.update(kwargs)
97 | self.kwargs = kwargs_default
98 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
99 |
100 | def generate_inner(self, message, dataset=None):
101 | prompt, image_path = self.message_to_promptimg(message)
102 |
103 | from llava.conversation import conv_templates
104 | from llava.model.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
105 | from llava.mm_utils import KeywordsStoppingCriteria, expand2square, tokenizer_image_token
106 |
107 | qs = DEFAULT_IMAGE_TOKEN + '\n' + prompt
108 | conv = conv_templates[self.conv_mode].copy()
109 | conv.append_message(conv.roles[0], qs)
110 | conv.append_message(conv.roles[1], None)
111 | prompt = conv.get_prompt()
112 |
113 | input_ids = (
114 | tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
115 | .unsqueeze(0)
116 | .cuda()
117 | )
118 |
119 | image = Image.open(image_path)
120 | if getattr(self.model.config, 'image_aspect_ratio', None) == 'pad':
121 | if image.mode == 'L':
122 | background_color = int(sum([int(x * 255) for x in self.image_processor.image_mean]) / 3)
123 | else:
124 | background_color = tuple(int(x * 255) for x in self.image_processor.image_mean)
125 | image = expand2square(image, background_color)
126 | image_tensor = self.image_processor.preprocess(image, return_tensors='pt')[
127 | 'pixel_values'
128 | ][0]
129 |
130 | stop_str = conv.sep
131 | keywords = [stop_str]
132 | stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
133 | self.model = self.model.to(dtype=torch.bfloat16)
134 | with torch.inference_mode():
135 | output_ids = self.model.generate(
136 | input_ids,
137 | images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16).cuda(),
138 | stopping_criteria=[stopping_criteria],
139 | use_cache=True,
140 | **self.kwargs)
141 |
142 | input_token_len = input_ids.shape[1]
143 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
144 | if n_diff_input_output > 0:
145 | print(
146 | f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids'
147 | )
148 | outputs = self.tokenizer.batch_decode(
149 | output_ids[:, input_token_len:], skip_special_tokens=True
150 | )[0]
151 | outputs = outputs.strip()
152 |
153 | if outputs.endswith(stop_str):
154 | outputs = outputs[: -len(stop_str)]
155 | outputs = outputs.strip()
156 | return outputs
157 |
--------------------------------------------------------------------------------
/assets/overview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/assets/overview.jpg
--------------------------------------------------------------------------------
/assets/taskmap.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/assets/taskmap.jpg
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import base64
4 | import time
5 | from openai import OpenAI
6 | from multiprocessing import Pool
7 | import re
8 |
9 | def remove_punctuation(text):
10 | return re.sub(r'^[.,()]+|[.,()]+$', '', text)
11 |
12 | client = OpenAI(
13 | base_url='xx',
14 | api_key='xx',
15 | )
16 |
17 | def build_prompt(question, options, prediction):
18 | tmpl = (
19 | "You are an AI assistant who will help me to match an answer with several options of a single-choice question. "
20 | "You are provided with a question, several options, and an answer, and you need to find which option is most similar to the answer. "
21 |         "If the meanings of all options are significantly different from the answer, output Z. "
22 |         "When the options are mostly numbers, if the model outputs numbers in the same format, please do not be too precise and try to match an answer as much as possible. "
23 |         "You should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n"
24 | "Example 1: \n"
25 | "Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\nAnswer: a cute teddy bear\nYour output: A\n"
26 | "Example 2: \n"
27 | "Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\nAnswer: Spider\nYour output: Z\n"
28 | "Example 3: \n"
29 | "Question: {}?\nOptions: {}\nAnswer: {}\nYour output: "
30 | )
31 | return tmpl.format(question, options, prediction)
32 |
33 |
34 | def process_data(args):
35 | data_tmp, modelname = args
36 | client = OpenAI(
39 | base_url='xx',
40 | api_key='xx',
41 | )
42 |
43 | options = data_tmp['options']
44 | question = data_tmp['question']
45 | prediction = data_tmp[modelname].strip()
46 |
47 | if modelname == 'Claude3' and "copyrighted material" in prediction:
48 | data_tmp[f'{modelname}_choice'] = 'Z'
49 | return data_tmp
50 |     if prediction in ('image none', 'model error or image error', 'image error', 'model error', ''):
51 | data_tmp[f'{modelname}_choice'] = 'Z'
52 | return data_tmp
53 | if '\u00a0' in prediction:
54 | prediction = prediction.replace('\u00a0','')
55 |
56 |
57 | prediction = remove_punctuation(prediction.strip())
58 |
59 | if prediction.strip().lower() not in ['a','b','c','d','e','f','g','h','i','j','k','l','m','n']:
60 |
61 |
62 |
63 | content = build_prompt(question,options,prediction)
64 |
65 | try:
66 | response = client.chat.completions.create(
67 | model="gpt-4o-mini",
68 | messages=[
69 | {
70 | "role": "user",
71 | "content": [
72 | {"type": "text", "text": content},
73 | ],
74 | }
75 | ],
76 | max_tokens=512,
77 | )
78 | # print(response.choices[0].message.content)
79 | grading = response.choices[0].message.content
80 |
81 | except Exception as e:
82 |             print('error: ', e)
83 | # grading = str(e)
84 | grading = 'GPT error'
85 |
86 |
87 | data_tmp[f'{modelname}_choice'] = grading.strip()
88 | print(modelname,': ',data_tmp[f'{modelname}_choice'])
89 | return data_tmp
90 | else:
91 | data_tmp[f'{modelname}_choice'] = prediction.strip()
92 | print(modelname,': ',data_tmp[f'{modelname}_choice'])
93 | return data_tmp
94 |
95 |
96 |
97 | def main():
98 | # modelnames = ['internvl1.5-chat']
99 | # modelnames = ['Gemini','Gemini1.0']
100 | # modelnames = ['GPT4o','Gemini','Gemini1.0']
101 | # modelnames = ['Llava-interleave']
102 | modelnames = ['Llava-interleave', 'qwen_chat', 'XComposer2', 'deepseek_vl_7b', 'qwen_base', 'XComposer2_1.8b', 'flamingov2', 'deepseek_vl_1.3b', 'internvl1.5-chat', 'idefics2_8b', 'Mantis', 'idefics_9b_instruct']
103 | directorys = ['xx','xx']
104 |
105 | for directory in directorys:
106 | tasknames = os.listdir(directory)
107 | for taskname in tasknames:
108 |
109 | path = os.path.join(directory,taskname)
110 | for modelname in modelnames:
111 | path = os.path.join(directory,taskname)
112 | path = os.path.join(path,modelname)
113 |
114 | print(taskname,modelname)
115 | json_path = os.path.join(path,'metadata_info.json')
116 |
117 |
118 |
119 | if not os.path.exists(json_path):
120 | print(json_path,' not exist')
121 | continue
122 |
123 | # output_json_path = os.path.join(path,'metadata_info_choice.json')
124 | output_json_path = os.path.join(path,'metadata_info_choice.json')
125 | # if os.path.exists(output_json_path) or os.path.exists(output_json_path1):
126 | if os.path.exists(output_json_path):
127 | print(output_json_path, ' already have')
128 | continue
129 |
130 | with open(json_path,'r') as f:
131 | data = json.load(f)
132 |
133 |                 # pack each entry with the model name as (data_tmp, modelname) tuples
134 | data_with_modelname = [(data_tmp, modelname) for data_tmp in data]
135 |
136 |
137 |
138 | pool = Pool(processes=10) # Adjust the number of processes as per your machine's capability
139 | # result = pool.map(process_data, data, modelname)
140 |                 # pass the packed (data, modelname) tuples through pool.map
141 | result = pool.map(process_data, data_with_modelname)
142 |
143 | # output_json_path = os.path.join(path,'metadata_info_choice.json')
144 |
145 | with open(output_json_path, 'w') as f:
146 | json.dump(result, f)
147 |
148 | print(taskname,modelname,'OK')
149 |
150 |
151 |
152 | if __name__ == '__main__':
153 | main()
154 |
155 |
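
`process_data` only calls the GPT judge when the stripped, punctuation-free prediction is not already a bare option letter; otherwise the letter itself is recorded as `{modelname}_choice`. When the judge is needed, the request body is simply the string returned by `build_prompt` from this file, which can be inspected offline (the sample values below are made up):

```python
# Preview the judge prompt without touching the API.
print(build_prompt(
    'What is the main object in image',
    'A. teddy bear B. rabbit C. cat D. dog',
    'a cute teddy bear',
))
```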
--------------------------------------------------------------------------------
/evaluate_correct.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import pandas as pd
4 |
5 | directorys = [
6 | 'xx'
7 | ]
8 |
9 |
10 | # Initialize global DataFrames to store data
11 | global_accuracy_df = pd.DataFrame()
12 |
13 | for directory in directorys:
14 | tasknames = sorted(os.listdir(directory))
15 |
16 | modelnames = ['GPT4o','Claude3','Gemini','Gemini1.0','Llava-interleave','Mantis','InternVL2','internvl1.5-chat','qwen_chat', 'qwen_base', 'idefics_9b_instruct','flamingov2', 'deepseek_vl_1.3b', 'XComposer2_1.8b', 'deepseek_vl_7b', 'idefics2_8b', 'XComposer2']
17 | # modelnames = ['Llava-interleave']
18 | # Initialize dictionaries to store data
19 | accuracy_data = {modelname: [] for modelname in modelnames}
20 |
21 | for taskname in tasknames:
22 | path = os.path.join(directory, taskname)
23 | for modelname in modelnames:
24 | json_path = os.path.join(path, modelname, 'metadata_info_choice.json')
25 |
26 | if os.path.exists(json_path):
27 | with open(json_path, 'r') as f:
28 | data = json.load(f)
29 | else:
30 | print('no json: ', taskname,modelname)
31 | accuracy_data[modelname].append(None)
32 | continue
33 |
34 |
35 |
36 | cnt = 0
37 | correct = 0
38 | cnt_z = 0
39 |
40 | for i in range(len(data)):
41 | data_tmp = data[i]
42 | flag = True
43 | if data_tmp[f'{modelname}_choice'].strip() == 'GPT error':
44 | print(modelname, taskname, 'GPT error')
45 | continue
46 |
47 |                 if data_tmp["output"] is None:
48 | flag = False
49 | continue
50 | gt = data_tmp["output"].strip().lower()
51 |
52 |                 if not flag:
53 | continue
54 |
55 | cnt += 1
56 |
57 | if data_tmp[f'{modelname}_choice'].strip().lower() in gt:
58 | correct += 1
59 |
60 |
61 | accuracy_data[modelname].append(correct / cnt)
62 | print(correct / cnt, taskname, modelname)
63 |
64 |
65 |
66 | # Convert dictionaries to DataFrames
67 | accuracy_df = pd.DataFrame(accuracy_data, index=tasknames)
68 |
69 | # Append to global DataFrames
70 | global_accuracy_df = pd.concat([global_accuracy_df, accuracy_df])
71 |
72 | # Calculate the overall average for each model
73 | global_accuracy_df.loc['Overall'] = global_accuracy_df.mean()
74 |
75 | # Save global DataFrames to CSV files
76 | global_accuracy_df.to_csv('./Accuracy_data_all.csv')
77 |
78 | print("Global DataFrames have been saved as CSV files.")
79 |
80 |
81 |
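
The CSV written above is a task-by-model accuracy matrix: one row per task directory, one column per model, and a final `Overall` row that is the unweighted mean over tasks (tasks with a missing JSON contribute `None` and are skipped by `mean()`). A tiny sketch of that shape with made-up numbers:

```python
import pandas as pd

# Hypothetical values illustrating the layout of Accuracy_data_all.csv.
df = pd.DataFrame(
    {'GPT4o': [0.62, 0.48], 'internvl1.5-chat': [0.55, None]},
    index=['task_a', 'task_b'],
)
df.loc['Overall'] = df.mean()   # NaN entries are ignored by mean()
print(df)
```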
--------------------------------------------------------------------------------