├── .DS_Store ├── README.md ├── VLMEvalKit ├── scripts │ ├── AI2D_preproc.ipynb │ ├── apires_scan.py │ ├── auto_run.py │ ├── cover.sh │ ├── mmb_eval_gradio.py │ ├── run.sh │ ├── srun.sh │ ├── summarize.py │ └── visualize.ipynb ├── test_Mantis.py ├── test_interlvl1.5.py ├── test_internvl2-pro.py ├── test_models.py ├── utils │ ├── __pycache__ │ │ └── conversation.cpython-39.pyc │ ├── conversation.py │ └── tools.py └── vlmeval │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── config.cpython-39.pyc │ └── tools.cpython-39.pyc │ ├── api │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── base.cpython-39.pyc │ │ ├── claude.cpython-39.pyc │ │ ├── gemini.cpython-39.pyc │ │ ├── glm_vision.cpython-39.pyc │ │ ├── gpt.cpython-39.pyc │ │ ├── gpt_int.cpython-39.pyc │ │ ├── hf_chat_model.cpython-39.pyc │ │ ├── qwen_api.cpython-39.pyc │ │ ├── qwen_vl_api.cpython-39.pyc │ │ ├── reka.cpython-39.pyc │ │ └── stepai.cpython-39.pyc │ ├── base.py │ ├── claude.py │ ├── gemini.py │ ├── glm_vision.py │ ├── gpt.py │ ├── gpt_int.py │ ├── hf_chat_model.py │ ├── qwen_api.py │ ├── qwen_vl_api.py │ ├── reka.py │ └── stepai.py │ ├── config.py │ ├── evaluate │ ├── OCRBench.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── OCRBench.cpython-39.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── coco_eval.cpython-39.pyc │ │ ├── llavabench.cpython-39.pyc │ │ ├── mathvista_eval.cpython-39.pyc │ │ ├── misc.cpython-39.pyc │ │ ├── mmvet_eval.cpython-39.pyc │ │ ├── multiple_choice.cpython-39.pyc │ │ ├── vqa_eval.cpython-39.pyc │ │ └── yes_or_no.cpython-39.pyc │ ├── coco_eval.py │ ├── llavabench.py │ ├── mathvista_eval.py │ ├── misc.py │ ├── mmvet_eval.py │ ├── multiple_choice.py │ ├── multiple_choice_mmeval.py │ ├── vqa_eval.py │ └── yes_or_no.py │ ├── inference.py │ ├── smp │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── file.cpython-39.pyc │ │ ├── log.cpython-39.pyc │ │ ├── misc.cpython-39.pyc │ │ └── vlm.cpython-39.pyc │ ├── file.py │ ├── log.py │ ├── misc.py │ └── vlm.py │ ├── tools.py │ ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── custom_prompt.cpython-39.pyc │ │ ├── dataset.cpython-39.pyc │ │ ├── dataset_config.cpython-39.pyc │ │ ├── matching_util.cpython-39.pyc │ │ ├── mp_util.cpython-39.pyc │ │ └── result_transfer.cpython-39.pyc │ ├── custom_prompt.py │ ├── dataset.py │ ├── dataset_config.py │ ├── matching_util.py │ ├── mp_util.py │ └── result_transfer.py │ └── vlm │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── base.cpython-39.pyc │ ├── bunnyllama3.cpython-39.pyc │ ├── cogvlm.cpython-39.pyc │ ├── deepseek_vl.cpython-39.pyc │ ├── emu.cpython-39.pyc │ ├── idefics.cpython-39.pyc │ ├── instructblip.cpython-39.pyc │ ├── internvl_chat.cpython-39.pyc │ ├── mgm.cpython-39.pyc │ ├── minicpm_v.cpython-39.pyc │ ├── minigpt4.cpython-39.pyc │ ├── mmalaya.cpython-39.pyc │ ├── monkey.cpython-39.pyc │ ├── mplug_owl2.cpython-39.pyc │ ├── omnilmm.cpython-39.pyc │ ├── open_flamingo.cpython-39.pyc │ ├── paligemma.cpython-39.pyc │ ├── pandagpt.cpython-39.pyc │ ├── phi3_vision.cpython-39.pyc │ ├── qh_360vl.cpython-39.pyc │ ├── qwen_vl.cpython-39.pyc │ ├── transcore_m.cpython-39.pyc │ ├── visualglm.cpython-39.pyc │ ├── vxverse.cpython-39.pyc │ ├── wemm.cpython-39.pyc │ └── yi_vl.cpython-39.pyc │ ├── base.py │ ├── bunnyllama3.py │ ├── cogvlm.py │ ├── deepseek_vl.py │ ├── emu.py │ ├── idefics.py │ ├── instructblip.py │ ├── internvl_chat.py │ ├── llava │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ 
├── llava.cpython-39.pyc │ │ └── llava_xtuner.cpython-39.pyc │ ├── llava.py │ └── llava_xtuner.py │ ├── mgm.py │ ├── minicpm_v.py │ ├── minigpt4.py │ ├── misc │ ├── blip2_instruct_vicuna13b.yaml │ ├── blip2_instruct_vicuna7b.yaml │ ├── minigpt4_13b_eval.yaml │ ├── minigpt4_7b_eval.yaml │ └── minigptv2_eval.yaml │ ├── mmalaya.py │ ├── monkey.py │ ├── mplug_owl2.py │ ├── omnilmm.py │ ├── open_flamingo.py │ ├── paligemma.py │ ├── pandagpt.py │ ├── phi3_vision.py │ ├── qh_360vl.py │ ├── qwen_vl.py │ ├── transcore_m.py │ ├── visualglm.py │ ├── vxverse.py │ ├── wemm.py │ ├── xcomposer │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── sharecaptioner.cpython-39.pyc │ │ ├── xcomposer.cpython-39.pyc │ │ ├── xcomposer2.cpython-39.pyc │ │ └── xcomposer2_4KHD.cpython-39.pyc │ ├── sharecaptioner.py │ ├── xcomposer.py │ ├── xcomposer2.py │ └── xcomposer2_4KHD.py │ └── yi_vl.py ├── assets ├── overview.jpg └── taskmap.jpg ├── evaluate.py └── evaluate_correct.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Best Practice 2 | 3 | **We strongly recommend using [VLMEevalKit](https://github.com/open-compass/VLMEvalKit) for its useful features and ready-to-use LVLM implementations**. 4 | 5 | # MMIU 6 | 7 |

8 | Quick Start | 9 | HomePage | 10 | arXiv | 11 | Dataset | 12 | Citation
13 |

14 |
15 |
16 | This repository is the official implementation of [MMIU](https://arxiv.org/abs/2408.02718).
17 |
18 | > [MMIU: Multimodal Multi-image Understanding for Evaluating Large Vision-Language Models](https://arxiv.org/abs/2408.02718)
19 | > Fanqing Meng\*, Jin Wang\*, Chuanhao Li\*, Quanfeng Lu, Hao Tian, Jiaqi Liao, Xizhou Zhu, Jifeng Dai, Yu Qiao, Ping Luo, Kaipeng Zhang\#, Wenqi Shao\#
20 | > \* MFQ, WJ, and LCH contributed equally.
21 | > \# SWQ (shaowenqi@pjlab.org.cn) and ZKP (zhangkaipeng@pjlab.org.cn) are corresponding authors.
22 |
23 | ## 💡 News
24 |
25 | - `2024/08/13`: We have released the code.
26 |
27 | - `2024/08/08`: We have released the dataset at https://huggingface.co/datasets/FanqingM/MMIU-Benchmark 🔥🔥🔥
28 |
29 | - `2024/08/05`: The dataset and code are coming soon! 🔥🔥🔥
30 |
31 | - `2024/08/05`: The technical report of [MMIU](https://arxiv.org/abs/2408.02718) is released! Also check our [project page](https://mmiu-bench.github.io/)! 🔥🔥🔥
32 |
33 |
34 | ## Introduction
35 | The Multimodal Multi-image Understanding (MMIU) benchmark is a comprehensive evaluation suite designed to assess LVLMs across a wide range of multi-image tasks. MMIU encompasses 7 types of multi-image relationships, 52 tasks, 77K images, and 11K meticulously curated multiple-choice questions, making it the most extensive benchmark of its kind.
36 | ![overview](assets/overview.jpg)
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 | ## Evaluation Results Overview
46 | - The closed-source proprietary model GPT-4o from OpenAI has taken the leading position on MMIU, surpassing other models such as InternVL2-pro, InternVL1.5-chat, Claude3.5-Sonnet, and Gemini1.5 Flash. Note that InternVL2-pro is the strongest of the open-source models, though it still trails GPT-4o.
47 |
48 |
49 | - Some powerful LVLMs, such as InternVL1.5 and GLM4V, whose pre-training data contain no multi-image content, even outperform many multi-image models that undergo multi-image supervised fine-tuning (SFT), indicating that strong single-image understanding is the foundation of multi-image comprehension.
50 | - Comparing performance at the level of image relationships, we find that LVLMs excel at understanding semantic content in multi-image scenarios but are weaker at comprehending temporal and spatial relationships across images.
51 | - The task-map analysis reveals that models perform better on high-level, in-domain understanding tasks such as video captioning, but struggle with out-of-domain tasks such as 3D detection (3D perception) and image ordering (temporal reasoning).
52 | - The task-learning-difficulty analysis shows that tasks involving ordering, retrieval, and large numbers of images cannot be overfit by simple SFT, suggesting that additional pre-training data or training techniques should be incorporated for improvement.
53 | ![taskmap](assets/taskmap.jpg)
54 |
55 |
56 | ## 🏆 Leaderboard
57 |
58 |
59 |
60 | | Rank | Model | Score |
61 | | ---- | ---------------------- | ----- |
62 | | **1** | **GPT4o** | **55.72** |
63 | | 2 | Gemini | 53.41 |
64 | | 3 | Claude3 | 53.38 |
65 | | **4** | **InternVL2** | **50.30** |
66 | | 5 | Mantis | 45.58 |
67 | | 6 | Gemini1.0 | 40.25 |
68 | | 7 | internvl1.5-chat | 37.39 |
69 | | 8 | Llava-interleave | 32.37 |
70 | | 9 | idefics2_8b | 27.80 |
71 | | 10 | glm-4v-9b | 27.02 |
72 | | 11 | deepseek_vl_7b | 24.64 |
73 | | 12 | XComposer2_1.8b | 23.46 |
74 | | 13 | deepseek_vl_1.3b | 23.21 |
75 | | 14 | flamingov2 | 22.26 |
76 | | 15 | llava_next_vicuna_7b | 22.25 |
77 | | 16 | XComposer2 | 21.91 |
78 | | 17 | MiniCPM-Llama3-V-2_5 | 21.61 |
79 | | 18 | llava_v1.5_7b | 19.19 |
80 | | 19 | sharegpt4v_7b | 18.52 |
81 | | 20 | sharecaptioner | 16.10 |
82 | | 21 | qwen_chat | 15.92 |
83 | | 22 | monkey-chat | 13.74 |
84 | | 23 | idefics_9b_instruct | 12.84 |
85 | | 24 | qwen_base | 5.16 |
86 | | - | Frequency Guess | 31.5 |
87 | | - | Random Guess | 27.4 |
88 |
89 |
90 |
91 |
92 | ## 🚀 Quick Start
93 |
94 | We mainly use the VLMEvalKit framework for testing, with a few standalone test scripts as well. Specifically, for multi-image models, we include the following (grouped by the `transformers` version they require):
95 |
96 | **transformers == 4.33.0**
97 |
98 | - `XComposer2`
99 | - `XComposer2_1.8b`
100 | - `qwen_base`
101 | - `idefics_9b_instruct`
102 | - `qwen_chat`
103 | - `flamingov2`
104 |
105 | **transformers == 4.37.0**
106 | - `deepseek_vl_1.3b`
107 | - `deepseek_vl_7b`
108 |
109 | **transformers == 4.40.0**
110 |
111 | - `idefics2_8b`
112 |
113 | For single-image models, we include the following:
114 |
115 | **transformers == 4.33.0**
116 |
117 | - `sharecaptioner`
118 | - `monkey-chat`
119 |
120 | **transformers == 4.37.0**
121 |
122 | - `sharegpt4v_7b`
123 | - `llava_v1.5_7b`
124 | - `glm-4v-9b`
125 |
126 | **transformers == 4.40.0**
127 |
128 | - `llava_next_vicuna_7b`
129 | - `MiniCPM-Llama3-V-2_5`
130 |
131 | We use the VLMEvalKit framework for testing; refer to the code in `VLMEvalKit/test_models.py`. For closed-source models, please replace the following part of the code with the corresponding API call, following this example:
132 |
133 | ```python
134 | response = model.generate(tmp) # tmp = image_paths + [question]
135 | ```
136 |
137 | For other open-source models, we provide reference code for `Mantis` and `InternVL1.5-chat`. For `LLaVA-Interleave`, please refer to the original repository.
138 |
139 |
140 |
141 |
142 | ## 💐 Acknowledgement
143 |
144 | We express our sincere gratitude to the following projects:
145 | - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) provides useful out-of-the-box tools and implements many advanced LVLMs. Thanks for their selfless dedication.
146 | - The InternVL team for providing the APIs.
147 |
148 |
149 | ## 📧 Contact
150 | If you have any questions, feel free to contact Fanqing Meng at mengfanqing33@gmail.com.
151 |
152 |
153 |
154 | ## 🖊️ Citation
155 | If you find MMIU useful in your project or research, please use the following BibTeX entry to cite our paper. Thanks!
156 | 157 | ``` 158 | @article{meng2024mmiu, 159 | title={MMIU: Multimodal Multi-image Understanding for Evaluating Large Vision-Language Models}, 160 | author={Meng, Fanqing and Wang, Jin and Li, Chuanhao and Lu, Quanfeng and Tian, Hao and Liao, Jiaqi and Zhu, Xizhou and Dai, Jifeng and Qiao, Yu and Luo, Ping and others}, 161 | journal={arXiv preprint arXiv:2408.02718}, 162 | year={2024} 163 | } 164 | ``` 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /VLMEvalKit/scripts/apires_scan.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from vlmeval import * 3 | FAIL_MSG = 'Failed to obtain answer via API.' 4 | 5 | root = sys.argv[1] 6 | if root[-1] in '/\\': 7 | root = root[:-1] 8 | 9 | model_name = root.split('/')[-1] 10 | datasets = list(dataset_URLs) 11 | 12 | for d in datasets: 13 | fname = f'{model_name}_{d}.xlsx' 14 | pth = osp.join(root, fname) 15 | if osp.exists(pth): 16 | data = load(pth) 17 | # Detect Failure 18 | assert 'prediction' in data 19 | fail = [FAIL_MSG in x for x in data['prediction']] 20 | if sum(fail): 21 | nfail = sum(fail) 22 | ntot = len(fail) 23 | print(f'Model {model_name} x Dataset {d}: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ') 24 | 25 | eval_files = ls(root, match=f'{model_name}_{d}_') 26 | eval_files = [x for x in eval_files if listinstr(['openai', 'gpt'], x) and x.endswith('.xlsx')] 27 | 28 | assert len(eval_files) == 1 29 | eval_file = eval_files[0] 30 | data = load(eval_file) 31 | 32 | if listinstr(['MathVista', 'MMVet'], d): 33 | bad = [x for x in data['log'] if 'All 5 retries failed.' in x] 34 | if len(bad): 35 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 36 | elif d == 'LLaVABench': 37 | sub = data[data['gpt4_score'] == -1] 38 | sub = sub[sub['gpt4_score'] == -1] 39 | if len(sub): 40 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(sub)} out of {len(data)} failed.') 41 | else: 42 | bad = [x for x in data['log'] if FAIL_MSG in x] 43 | if len(bad): 44 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 45 | -------------------------------------------------------------------------------- /VLMEvalKit/scripts/auto_run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from vlmeval.smp import * 3 | from vlmeval.config import supported_VLM 4 | 5 | def is_api(x): 6 | return getattr(supported_VLM[x].func, 'is_api', False) 7 | 8 | models = list(supported_VLM) 9 | models = [x for x in models if 'fs' not in x] 10 | models = [x for x in models if not is_api(x)] 11 | exclude_list = ['cogvlm-grounding-generalist', 'emu2'] 12 | models = [x for x in models if x not in exclude_list] 13 | 14 | def is_large(x): 15 | return '80b' in x or 'emu2' in x or '34B' in x 16 | 17 | small_models = [x for x in models if not is_large(x)] 18 | large_models = [x for x in models if is_large(x)] 19 | models = small_models + large_models 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--data', type=str, nargs='+', required=True) 23 | args = parser.parse_args() 24 | 25 | # Skip some models 26 | models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)] 27 | 28 | for m in models: 29 | unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')] 30 | if len(unknown_datasets) == 0: 31 | continue 32 | dataset_str = ' '.join(unknown_datasets) 
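    # Models whose name contains '80b' are launched directly with `python run.py`,
    # while all other models go through run.sh, which (see scripts/run.sh below)
    # wraps run.py with torchrun across all visible GPUs.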
33 | if '80b' in m: 34 | cmd = f'python run.py --data {dataset_str} --model {m}' 35 | else: 36 | cmd = f'bash run.sh --data {dataset_str} --model {m}' 37 | print(cmd) 38 | os.system(cmd) -------------------------------------------------------------------------------- /VLMEvalKit/scripts/cover.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 3 | cp $DIR/../config.py $DIR/../vlmeval/ 4 | cp $DIR/../misc/* $DIR/../vlmeval/vlm/misc/ -------------------------------------------------------------------------------- /VLMEvalKit/scripts/mmb_eval_gradio.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.evaluate.multiple_choice import multiple_choice_eval 3 | import gradio as gr 4 | 5 | HEADER = """ 6 | # Welcome to MMBench👏👏 7 | We are delighted that you are willing to submit the evaluation results to the MMBench official website! The evaluation service currently can handle submissions of MMBench, MMBench-CN, and CCBench. We use `gpt-3.5-turbo-0613` to help answer matching. Evaluation Codes in VLMEvalKit: https://github.com/open-compass/VLMEvalKit. Please adopt / follow the implementation of VLMEvalKit to generate the submission files. 8 | Moreover, this is a temporary solution, which **does not support ChatGPT-based answer extraction**. So you need to make sure values in the `prediction` field of your submission files are single characters in A, B, C, D. Other free-form answers can not be recognized by the evaluation script and will be marked as **WRONG**. 9 | 10 | The evaluation script is available at https://github.com/open-compass/VLMEvalKit/tree/main/scripts/mmb_eval_gradio.py 11 | Please contact `opencompass@pjlab.org.cn` for any inquirys about this script. 12 | """ 13 | 14 | def upload_file(file): 15 | file_path = file.name 16 | return file_path 17 | 18 | def prepare_file(file_name): 19 | file_md5 = md5(file_name) 20 | root = LMUDataRoot() 21 | root = osp.join(root, 'eval_server') 22 | os.makedirs(root, exist_ok=True) 23 | suffix = file_name.split('.')[-1] 24 | if suffix not in ['xlsx', 'tsv', 'csv']: 25 | return False, "Please submit a file that ends with `.xlsx`, `.tsv`, or `.csv`" 26 | new_file_name = osp.join(root, f'{file_md5}.{suffix}') 27 | shutil.move(file_name, new_file_name) 28 | eval_file = new_file_name 29 | try: 30 | data = load(eval_file) 31 | except: 32 | return False, "Your excel file can not be successfully loaded by `pd.read_excel`, please double check and submit again. 
" 33 | for k in data.keys(): 34 | data[k.lower() if k not in 'ABCD' else k] = data.pop(k) 35 | if "index" not in data: 36 | return False, "Your excel file should have a column named `index`, please double check and submit again" , {} 37 | if "prediction" not in data: 38 | return False, "Your excel file should have a column named `prediction`, please double check and submit again" , {} 39 | for ch in 'ABCD': 40 | if ch not in data: 41 | return False, f"Your excel file should have a column named `{ch}`, please double check and submit again" , {} 42 | dump(data, eval_file) 43 | return True, eval_file 44 | 45 | def determine_dataset(eval_file): 46 | data = load(eval_file) 47 | def cn_ratio(data): 48 | iscn = [cn_string(x) for x in data['question']] 49 | return np.mean(iscn) 50 | if len(data) < 2500 and 'l2-category' not in data: 51 | return 'CCBench' if cn_ratio(data) > 0.5 else "Unknown" 52 | else: 53 | return 'MMBench_CN' if cn_ratio(data) > 0.5 else "MMBench" 54 | 55 | def reformat_acc(acc): 56 | splits = set(acc['split']) 57 | keys = list(acc.keys()) 58 | keys.remove('split') 59 | nacc = {'Category': []} 60 | for sp in splits: 61 | nacc[sp.upper()] = [] 62 | for k in keys: 63 | nacc['Category'].append(k) 64 | for sp in splits: 65 | nacc[sp.upper()].append(acc[acc['split'] == sp].iloc[0][k] * 100) 66 | return pd.DataFrame(nacc) 67 | 68 | def evaluate(file): 69 | file_name = file.name 70 | flag, eval_file = prepare_file(file_name) 71 | if not flag: 72 | return "Error: " + eval_file 73 | dataset = determine_dataset(eval_file) 74 | if dataset == 'Unknown': 75 | return "Error: Cannot determine the dataset given your submitted file. " 76 | 77 | eval_id = eval_file.split('/')[-1].split('.')[0] 78 | ret = f"Evaluation ID: {eval_id}\n" 79 | timestamp = datetime.datetime.now().strftime('%Y.%m.%d %H:%M:%S') 80 | ret += f'Evaluation Timestamp: {timestamp}\n' 81 | acc = multiple_choice_eval(eval_file, dataset=dataset, model='exact_matching') 82 | nacc = reformat_acc(acc).round(1) 83 | return ret, nacc 84 | 85 | with gr.Blocks() as demo: 86 | gr.Markdown(HEADER) 87 | file_output = gr.File() 88 | upload_button = gr.UploadButton("Click to upload you prediction files for a supported benchmark") 89 | upload_button.upload(upload_file, upload_button, file_output) 90 | 91 | btn = gr.Button("🚀 Evaluate") 92 | eval_log = gr.Textbox(label="Evaluation Log", placeholder="Your evaluation log will be displayed here") 93 | df_empty = pd.DataFrame([], columns=['Evaluation Result']) 94 | eval_result = gr.components.DataFrame(value=df_empty) 95 | btn.click(evaluate, inputs=[file_output], outputs=[eval_log, eval_result]) 96 | 97 | if __name__ == '__main__': 98 | demo.launch(server_name='0.0.0.0', debug=True, show_error=True) -------------------------------------------------------------------------------- /VLMEvalKit/scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | export GPU=$(nvidia-smi --list-gpus | wc -l) 4 | torchrun --nproc-per-node=$GPU run.py ${@:1} -------------------------------------------------------------------------------- /VLMEvalKit/scripts/srun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2} -------------------------------------------------------------------------------- /VLMEvalKit/scripts/summarize.py: 
-------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.utils.dataset_config import dataset_URLs 3 | 4 | def get_score(model, dataset): 5 | 6 | file_name = f'{model}/{model}_{dataset}' 7 | if listinstr([ 8 | 'CCBench', 'MMBench', 'SEEDBench_IMG', 'MMMU', 'ScienceQA', 'AI2D_TEST', 'MMStar', 'RealWorldQA'], dataset): 9 | file_name += '_acc.csv' 10 | elif listinstr(['MME', 'Hallusion', 'LLaVABench'], dataset): 11 | file_name += '_score.csv' 12 | elif listinstr(['MMVet', 'MathVista'], dataset): 13 | file_name += '_gpt-4-turbo_score.csv' 14 | elif listinstr(['COCO', 'OCRBench'], dataset): 15 | file_name += '_score.json' 16 | else: 17 | raise NotImplementedError 18 | 19 | if not osp.exists(file_name): 20 | return {} 21 | 22 | data = load(file_name) 23 | ret = {} 24 | if dataset == 'CCBench': 25 | ret[dataset] = data['Overall'][0] * 100 26 | elif dataset == 'MMBench': 27 | for n, a in zip(data['split'], data['Overall']): 28 | if n == 'dev': 29 | ret['MMBench_DEV_EN'] = a * 100 30 | elif n == 'test': 31 | ret['MMBench_TEST_EN'] = a * 100 32 | elif dataset == 'MMBench_CN': 33 | for n, a in zip(data['split'], data['Overall']): 34 | if n == 'dev': 35 | ret['MMBench_DEV_CN'] = a * 100 36 | elif n == 'test': 37 | ret['MMBench_TEST_CN'] = a * 100 38 | elif listinstr(['SEEDBench', 'ScienceQA', 'MMBench', 'AI2D_TEST', 'MMStar', 'RealWorldQA'], dataset): 39 | ret[dataset] = data['Overall'][0] * 100 40 | elif 'MME' == dataset: 41 | ret[dataset] = data['perception'][0] + data['reasoning'][0] 42 | elif 'MMVet' == dataset: 43 | data = data[data['Category'] == 'Overall'] 44 | ret[dataset] = float(data.iloc[0]['acc']) 45 | elif 'HallusionBench' == dataset: 46 | data = data[data['split'] == 'Overall'] 47 | for met in ['aAcc', 'qAcc', 'fAcc']: 48 | ret[dataset + f' ({met})'] = float(data.iloc[0][met]) 49 | elif 'MMMU' in dataset: 50 | data = data[data['split'] == 'validation'] 51 | ret['MMMU (val)'] = float(data.iloc[0]['Overall']) * 100 52 | elif 'MathVista' in dataset: 53 | data = data[data['Task&Skill'] == 'Overall'] 54 | ret[dataset] = float(data.iloc[0]['acc']) 55 | elif 'LLaVABench' in dataset: 56 | data = data[data['split'] == 'overall'].iloc[0] 57 | ret[dataset] = float(data['Relative Score (main)']) 58 | elif 'OCRBench' in dataset: 59 | ret[dataset] = data['Final Score'] 60 | 61 | return ret 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument('--data', type=str, nargs='+', default=[]) 66 | parser.add_argument("--model", type=str, nargs='+', required=True) 67 | args = parser.parse_args() 68 | return args 69 | 70 | def gen_table(models, datasets): 71 | res = defaultdict(dict) 72 | for m in models: 73 | for d in datasets: 74 | try: 75 | res[m].update(get_score(m, d)) 76 | except: 77 | pass 78 | keys = [] 79 | for m in models: 80 | for d in res[m]: 81 | keys.append(d) 82 | keys = list(set(keys)) 83 | keys.sort() 84 | final = defaultdict(list) 85 | for m in models: 86 | final['Model'].append(m) 87 | for k in keys: 88 | if k in res[m]: 89 | final[k].append(res[m][k]) 90 | else: 91 | final[k].append(None) 92 | final = pd.DataFrame(final) 93 | dump(final, 'summ.csv') 94 | if len(final) >= len(final.iloc[0].keys()): 95 | print(tabulate(final)) 96 | else: 97 | print(tabulate(final.T)) 98 | 99 | if __name__ == '__main__': 100 | args = parse_args() 101 | if args.data == []: 102 | args.data = list(dataset_URLs) 103 | gen_table(args.model, args.data) 
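# Usage sketch (illustrative; the model and dataset names below are examples only): once a
# VLMEvalKit run has produced per-model result files such as
# llava_v1.5_7b/llava_v1.5_7b_MMBench_acc.csv, running
#   python summarize.py --model llava_v1.5_7b qwen_chat --data MMBench MME
# aggregates the scores into summ.csv and prints them as a table via gen_table().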
-------------------------------------------------------------------------------- /VLMEvalKit/test_Mantis.py: -------------------------------------------------------------------------------- 1 | 2 | import requests 3 | import torch 4 | from PIL import Image 5 | from io import BytesIO 6 | 7 | from transformers import AutoProcessor, AutoModelForVision2Seq 8 | from transformers.image_utils import load_image 9 | import os 10 | import json 11 | import time 12 | import random 13 | 14 | 15 | processor = AutoProcessor.from_pretrained("TIGER-Lab/Mantis-8B-Idefics2") # do_image_splitting is False by default 16 | model = AutoModelForVision2Seq.from_pretrained( 17 | "TIGER-Lab/Mantis-8B-Idefics2", 18 | device_map="auto" 19 | ) 20 | generation_kwargs = { 21 | "max_new_tokens": 1024, 22 | "num_beams": 1, 23 | "do_sample": False 24 | } 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | def call_mantis(image_paths,question): 33 | # Note that passing the image urls (instead of the actual pil images) to the processor is also possible 34 | images = [] 35 | try: 36 | for image_path in image_paths: 37 | image = load_image(image_path) 38 | images.append(image) 39 | except Exception as e: 40 | print(e) 41 | return 'image error' 42 | 43 | content_list = [] 44 | for i in range(len(images)): 45 | content_list.append({"type": "image"}) 46 | content_list.append({"type": "text", "text": question}) 47 | 48 | 49 | messages = [ 50 | { 51 | "role": "user", 52 | "content": content_list 53 | } 54 | ] 55 | 56 | try: 57 | prompt = processor.apply_chat_template(messages, add_generation_prompt=True) 58 | inputs = processor(text=prompt, images=images, return_tensors="pt") 59 | inputs = {k: v.to(model.device) for k, v in inputs.items()} 60 | 61 | # Generate 62 | generated_ids = model.generate(**inputs, **generation_kwargs) 63 | response = processor.batch_decode(generated_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True) 64 | print("ASSISTANT: ", response[0]) 65 | return response[0] 66 | except Exception as e: 67 | print(e) 68 | print('model error') 69 | return 'model error' 70 | 71 | 72 | json_path = 'all.json' 73 | 74 | tasks_exist = ['person_reid', 'multiple_image_captioning', 'spot_the_similarity', 'face_retrieval', 'sketch2image_retrieval', 'handwritten_retrieval', 'spot_the_diff', 'image2image_retrieval', 'vehicle_retrieval', 'text2image_retrieval', 75 | 'general_action_recognition', 'video_captioning', 'next_img_prediction', 'temporal_ordering', 'meme_vedio_understanding', 'action_quality_assessment', 'temporal_localization', 'mevis', 76 | 'ravens_progressive_matrices', 'threed_indoor_recognition', 'point_tracking', 'threed_cad_recognition', 'single_object_tracking'] 77 | with open(json_path,'r') as f: 78 | data_all = json.load(f) 79 | 80 | result = [] 81 | 82 | 83 | 84 | for modelname in models: 85 | output_dir = os.path.join('./result') 86 | 87 | output_dir = os.path.join(output_dir,modelname) 88 | if not os.path.exists(output_dir): 89 | os.makedirs(output_dir) 90 | 91 | output_path = os.path.join(output_dir,'metadata_info.json') 92 | 93 | for task_data in data_all: 94 | 95 | context = task_data["context"] 96 | question = task_data["question"] 97 | 98 | tmp = [] 99 | image_flag = True 100 | 101 | for image_path in task_data["input_image_path"]: 102 | 103 | tmp.append(image_path) 104 | if not os.path.exists(image_path): 105 | image_flag = False 106 | break 107 | 108 | if image_flag == False: 109 | response = 'image none' 110 | task_data[modelname] = response 111 | print(modelname, task,len(tmp), ': ',response) 112 | 
result.append(task_data) 113 | continue 114 | 115 | 116 | 117 | try: 118 | 119 | if task_data['task'] in tasks_exist: 120 | question = question + '\n' + context 121 | else: 122 | question = context + '\n' + question 123 | question = question + '\nPlease answer the option directly like A,B,C,D...' 124 | 125 | response = call_mantis(tmp,question) 126 | task_data[modelname] = response 127 | print(modelname, task,len(tmp), ': ',response) 128 | except: 129 | response = 'model error or image error' 130 | task_data[modelname] = response 131 | print(modelname, task,len(tmp),': ',response) 132 | result.append(task_data) 133 | 134 | 135 | 136 | with open(output_path,'w') as f: 137 | json.dump(result,f) 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /VLMEvalKit/test_interlvl1.5.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModel 2 | import torch 3 | import torchvision.transforms as T 4 | from PIL import Image 5 | import random 6 | from torchvision.transforms.functional import InterpolationMode 7 | import os 8 | import json 9 | 10 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 11 | IMAGENET_STD = (0.229, 0.224, 0.225) 12 | 13 | 14 | def build_transform(input_size): 15 | MEAN, STD = IMAGENET_MEAN, IMAGENET_STD 16 | transform = T.Compose([ 17 | T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), 18 | T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), 19 | T.ToTensor(), 20 | T.Normalize(mean=MEAN, std=STD) 21 | ]) 22 | return transform 23 | 24 | 25 | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): 26 | best_ratio_diff = float('inf') 27 | best_ratio = (1, 1) 28 | area = width * height 29 | for ratio in target_ratios: 30 | target_aspect_ratio = ratio[0] / ratio[1] 31 | ratio_diff = abs(aspect_ratio - target_aspect_ratio) 32 | if ratio_diff < best_ratio_diff: 33 | best_ratio_diff = ratio_diff 34 | best_ratio = ratio 35 | elif ratio_diff == best_ratio_diff: 36 | if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: 37 | best_ratio = ratio 38 | return best_ratio 39 | 40 | 41 | def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False): 42 | orig_width, orig_height = image.size 43 | aspect_ratio = orig_width / orig_height 44 | 45 | # calculate the existing image aspect ratio 46 | target_ratios = set( 47 | (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if 48 | i * j <= max_num and i * j >= min_num) 49 | target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) 50 | 51 | # find the closest aspect ratio to the target 52 | target_aspect_ratio = find_closest_aspect_ratio( 53 | aspect_ratio, target_ratios, orig_width, orig_height, image_size) 54 | 55 | # calculate the target width and height 56 | target_width = image_size * target_aspect_ratio[0] 57 | target_height = image_size * target_aspect_ratio[1] 58 | blocks = target_aspect_ratio[0] * target_aspect_ratio[1] 59 | 60 | # resize the image 61 | resized_img = image.resize((target_width, target_height)) 62 | processed_images = [] 63 | for i in range(blocks): 64 | box = ( 65 | (i % (target_width // image_size)) * image_size, 66 | (i // (target_width // image_size)) * image_size, 67 | ((i % (target_width // image_size)) + 1) * image_size, 68 | ((i // (target_width // image_size)) + 1) * image_size 69 | ) 70 | # split the image 71 | 
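        # Each `box` selects one image_size x image_size tile, iterating row-major over the resized grid.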
split_img = resized_img.crop(box) 72 | processed_images.append(split_img) 73 | assert len(processed_images) == blocks 74 | if use_thumbnail and len(processed_images) != 1: 75 | thumbnail_img = image.resize((image_size, image_size)) 76 | processed_images.append(thumbnail_img) 77 | return processed_images 78 | 79 | 80 | def load_image(image_file, input_size=448, max_num=6): 81 | image = Image.open(image_file).convert('RGB') 82 | transform = build_transform(input_size=input_size) 83 | images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) 84 | pixel_values = [transform(image) for image in images] 85 | pixel_values = torch.stack(pixel_values) 86 | return pixel_values 87 | 88 | path = "/mnt/hwfile/gveval/mengfanqing/InternVL-Chat-V1-5" 89 | # If you have an 80G A100 GPU, you can put the entire model on a single GPU. 90 | model = AutoModel.from_pretrained( 91 | path, 92 | torch_dtype=torch.bfloat16, 93 | low_cpu_mem_usage=True, 94 | trust_remote_code=True).eval().cuda() 95 | 96 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 97 | 98 | 99 | generation_config = dict( 100 | num_beams=1, 101 | max_new_tokens=4096, 102 | do_sample=False, 103 | ) 104 | 105 | 106 | 107 | 108 | json_path = 'all.json' 109 | 110 | tasks_exist = ['person_reid', 'multiple_image_captioning', 'spot_the_similarity', 'face_retrieval', 'sketch2image_retrieval', 'handwritten_retrieval', 'spot_the_diff', 'image2image_retrieval', 'vehicle_retrieval', 'text2image_retrieval', 111 | 'general_action_recognition', 'video_captioning', 'next_img_prediction', 'temporal_ordering', 'meme_vedio_understanding', 'action_quality_assessment', 'temporal_localization', 'mevis', 112 | 'ravens_progressive_matrices', 'threed_indoor_recognition', 'point_tracking', 'threed_cad_recognition', 'single_object_tracking'] 113 | with open(json_path,'r') as f: 114 | data_all = json.load(f) 115 | 116 | result = [] 117 | 118 | 119 | 120 | for modelname in models: 121 | output_dir = os.path.join('./result') 122 | 123 | output_dir = os.path.join(output_dir,modelname) 124 | if not os.path.exists(output_dir): 125 | os.makedirs(output_dir) 126 | 127 | output_path = os.path.join(output_dir,'metadata_info.json') 128 | 129 | for task_data in data_all: 130 | 131 | context = task_data["context"] 132 | question = task_data["question"] 133 | 134 | tmp = [] 135 | image_flag = True 136 | 137 | for image_path in task_data["input_image_path"]: 138 | 139 | tmp.append(image_path) 140 | if not os.path.exists(image_path): 141 | image_flag = False 142 | break 143 | 144 | if image_flag == False: 145 | response = 'image none' 146 | task_data[modelname] = response 147 | print(modelname, task,len(tmp), ': ',response) 148 | result.append(task_data) 149 | continue 150 | 151 | 152 | 153 | try: 154 | 155 | if task_data['task'] in tasks_exist: 156 | question = question + '\n' + context 157 | else: 158 | question = context + '\n' + question 159 | question = question + '\nPlease answer the option directly like A,B,C,D...' 
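            # Note: `tmp` holds image *paths* at this point, so torch.cat over it would fail;
            # each path presumably needs to be converted to pixel tensors first via load_image()
            # defined above, e.g. torch.cat([load_image(p) for p in tmp], dim=0).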
160 | pixel_values = torch.cat(tmp, dim=0) 161 | response, _ = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True) 162 | task_data[modelname] = response 163 | print(modelname, task,len(tmp), ': ',response) 164 | except: 165 | response = 'model error or image error' 166 | task_data[modelname] = response 167 | print(modelname, task,len(tmp),': ',response) 168 | result.append(task_data) 169 | 170 | 171 | 172 | with open(output_path,'w') as f: 173 | json.dump(result,f) 174 | 175 | -------------------------------------------------------------------------------- /VLMEvalKit/test_internvl2-pro.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | 4 | url = "xx" # 替换为实际的API接口(API) 5 | api_key = "InternVL-2-Pro_da046f58b9adc971c2a9f002d8ad4e5704cadf76161268db240bf3afea8b9d78_gI8iJTcO" # 替换为实际生成的API密钥(KEY) 6 | 7 | 8 | # high-level obj 9 | context = "You are given a GUI navigation task that includes current screenshot images, historical screenshot images, and corresponding actions, where the last image represents the current screenshot and the preceding images are historical screenshots. The task is: Open the Firefox Browser to search for the best video blogs on travel vlogs. Then, go to the Setting app to turn up the brightness on your phone.\nThe historical actions are: step 1: CLICK: (603, 801)\nstep 2: CLICK: (190, 427)\nstep 3: CLICK: (834, 565)\n\nPlease predict the next action to complete the task.\nSelect from the following choices.\nA: SCROLL: LEFT\nB: SCROLL: UP\nC: SCROLL: DOWN\nD: CLICK: (31, 960)\n" 10 | question = "The last image represents the current screenshot and the preceding images are historical screenshots. The historical actions are: step 1: CLICK: (603, 801)\nstep 2: CLICK: (190, 427)\nstep 3: CLICK: (834, 565)\nI want to Open the Firefox Browser to search for the best video blogs on travel vlogs. Then, go to the Setting app to turn up the brightness on your phone. Finally, open the YouTube app to follow the video blogs you found. Which action should I do next?" 11 | question = context + '\n' + question 12 | question = question + '\nPlease answer the option directly like A,B,C,D...' 
13 | 14 | file_paths = [ 15 | "3873605806112821_0.png", 16 | "3873605806112821_1.png", 17 | "3873605806112821_2.png", 18 | "3873605806112821_3.png" 19 | ] 20 | 21 | 22 | 23 | 24 | files = [('files', open(file_path, 'rb')) for file_path in file_paths] 25 | data = { 26 | 'question': question, 27 | 'api_key': api_key 28 | } 29 | 30 | while True: 31 | try: 32 | response = requests.post(url, files=files, data=data) 33 | if response.status_code == 200: 34 | print("Response:", response.json().get("response", "No response key found in the JSON.")) 35 | break # Exit the loop if the request is successful 36 | else: 37 | print("Error:", response.status_code, response.text) 38 | except requests.exceptions.RequestException as e: 39 | print(f"Request failed: {e}") 40 | 41 | # Wait for a while before retrying 42 | time.sleep(2) 43 | 44 | 45 | print('------------------------------') 46 | -------------------------------------------------------------------------------- /VLMEvalKit/test_models.py: -------------------------------------------------------------------------------- 1 | 2 | from vlmeval.config import supported_VLM 3 | import os 4 | import json 5 | 6 | # transformers == 33.0 7 | # ['XComposer2','XComposer2_1.8b','qwen_base','idefics_9b_instruct','qwen_chat', 'flamingov2'] 8 | 9 | 10 | # transformers == 37.0 11 | # ['deepseek_vl_1.3b','deepseek_vl_7b'] 12 | 13 | # transformers == 40.0 14 | # ['idefics2_8b'] 15 | 16 | 17 | models = ['XComposer2','XComposer2_1.8b','qwen_base','idefics_9b_instruct','qwen_chat', 'flamingov2'] 18 | json_path = 'all.json' 19 | 20 | tasks_exist = ['person_reid', 'multiple_image_captioning', 'spot_the_similarity', 'face_retrieval', 'sketch2image_retrieval', 'handwritten_retrieval', 'spot_the_diff', 'image2image_retrieval', 'vehicle_retrieval', 'text2image_retrieval', 21 | 'general_action_recognition', 'video_captioning', 'next_img_prediction', 'temporal_ordering', 'meme_vedio_understanding', 'action_quality_assessment', 'temporal_localization', 'mevis', 22 | 'ravens_progressive_matrices', 'threed_indoor_recognition', 'point_tracking', 'threed_cad_recognition', 'single_object_tracking'] 23 | with open(json_path,'r') as f: 24 | data_all = json.load(f) 25 | 26 | result = [] 27 | 28 | 29 | 30 | for modelname in models: 31 | model = supported_VLM[modelname]() 32 | output_dir = os.path.join('./result') 33 | 34 | output_dir = os.path.join(output_dir,modelname) 35 | if not os.path.exists(output_dir): 36 | os.makedirs(output_dir) 37 | 38 | output_path = os.path.join(output_dir,'metadata_info.json') 39 | 40 | for task_data in data_all: 41 | 42 | context = task_data["context"] 43 | question = task_data["question"] 44 | 45 | tmp = [] 46 | image_flag = True 47 | 48 | for image_path in task_data["input_image_path"]: 49 | 50 | tmp.append(image_path) 51 | if not os.path.exists(image_path): 52 | image_flag = False 53 | break 54 | 55 | if image_flag == False: 56 | response = 'image none' 57 | task_data[modelname] = response 58 | print(modelname, task,len(tmp), ': ',response) 59 | result.append(task_data) 60 | continue 61 | 62 | 63 | 64 | try: 65 | 66 | if task_data['task'] in tasks_exist: 67 | question = question + '\n' + context 68 | else: 69 | question = context + '\n' + question 70 | question = question + '\nPlease answer the option directly like A,B,C,D...' 
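            # `tmp` already holds this sample's image paths; appending the question yields the
            # interleaved input that VLMEvalKit models expect (tmp = image_paths + [question]).
            # Per the README, the model.generate(tmp) call below is the part to swap out when
            # testing a closed-source API model.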
71 | tmp.append(question) 72 | response = model.generate(tmp) 73 | task_data[modelname] = response 74 | print(modelname, task,len(tmp), ': ',response) 75 | except: 76 | response = 'model error or image error' 77 | task_data[modelname] = response 78 | print(modelname, task,len(tmp),': ',response) 79 | result.append(task_data) 80 | 81 | 82 | 83 | with open(output_path,'w') as f: 84 | json.dump(result,f) 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /VLMEvalKit/utils/__pycache__/conversation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/utils/__pycache__/conversation.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/utils/tools.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/utils/tools.py -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | except ImportError: 4 | pass 5 | 6 | from .smp import * 7 | from .api import * 8 | from .evaluate import * 9 | from .utils import * 10 | from .vlm import * 11 | from .config import * 12 | from .tools import cli 13 | 14 | load_env() 15 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/__pycache__/config.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/__pycache__/config.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/__pycache__/tools.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/__pycache__/tools.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import OpenAIWrapper, GPT4V 2 | from .gpt_int import OpenAIWrapperInternal, GPT4V_Internal 3 | from .hf_chat_model import HFChatModel 4 | from .gemini import GeminiWrapper, GeminiProVision 5 | from .qwen_vl_api import QwenVLWrapper, QwenVLAPI 6 | from .qwen_api import QwenAPI 7 | from .stepai import Step1V_INT 8 | from .claude import Claude_Wrapper, Claude3V 9 | from .reka import Reka 10 | from .glm_vision import GLMVisionAPI 11 | 12 | __all__ = [ 13 | 'OpenAIWrapper', 'HFChatModel', 'OpenAIWrapperInternal', 'GeminiWrapper', 14 | 'GPT4V', 'GPT4V_Internal', 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI', 15 | 'QwenAPI', 'Claude3V', 'Claude_Wrapper', 'Reka', 
'Step1V_INT', 'GLMVisionAPI' 16 | ] 17 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/base.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/base.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/claude.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/claude.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/gemini.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/gemini.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/glm_vision.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/glm_vision.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/gpt.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/gpt.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/gpt_int.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/gpt_int.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/hf_chat_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/hf_chat_model.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/qwen_api.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/qwen_api.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/qwen_vl_api.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/qwen_vl_api.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/reka.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/reka.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__pycache__/stepai.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/api/__pycache__/stepai.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/claude.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import base64 5 | import mimetypes 6 | 7 | url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat' 8 | headers = { 9 | 'alles-apin-token': '', 10 | 'Content-Type': 'application/json' 11 | } 12 | 13 | 14 | class Claude_Wrapper(BaseAPI): 15 | 16 | is_api: bool = True 17 | 18 | def __init__(self, 19 | model: str = 'claude-3-opus-20240229', 20 | key: str = None, 21 | retry: int = 10, 22 | wait: int = 3, 23 | system_prompt: str = None, 24 | verbose: bool = True, 25 | temperature: float = 0, 26 | max_tokens: int = 1024, 27 | **kwargs): 28 | 29 | self.model = model 30 | self.headers = headers 31 | self.temperature = temperature 32 | self.max_tokens = max_tokens 33 | if key is not None: 34 | self.key = key 35 | else: 36 | self.key = os.environ.get('ALLES', '') 37 | self.headers['alles-apin-token'] = self.key 38 | 39 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 40 | 41 | def build_msgs(self, msgs_raw): 42 | 43 | messages = [] 44 | message = {'role': 'user', 'content': []} 45 | for msg in msgs_raw: 46 | if msg['type'] == 'image': 47 | pth = msg['value'] 48 | suffix = osp.splitext(pth)[-1].lower() 49 | media_type = mimetypes.types_map.get(suffix, None) 50 | assert media_type is not None 51 | 52 | item = { 53 | 'type': 'image', 54 | 'source': {'type': 'base64', 'media_type': media_type, 'data': encode_image_file_to_base64(pth)} 55 | } 56 | 57 | elif msg['type'] == 'text': 58 | item = {'type': 'text', 'text': msg['value']} 59 | else: 60 | raise NotImplementedError(f'Unsupported message type: {msg["type"]}') 61 | 62 | message['content'].append(item) 63 | messages.append(message) 64 | return messages 65 | 66 | def generate_inner(self, inputs, **kwargs) -> str: 67 | 68 | payload = json.dumps({ 69 | 'model': self.model, 70 | 'max_tokens': self.max_tokens, 71 | 'messages': self.build_msgs(msgs_raw=inputs), 72 | **kwargs 73 | }) 74 | response = requests.request('POST', url, headers=headers, data=payload) 75 | 76 | ret_code = response.status_code 77 | retry = self.retry 78 | while ret_code == 429 and retry > 0: 79 | sleep(15) 80 | response = requests.request('POST', url, headers=headers, data=payload) 81 | ret_code = response.status_code 82 | retry -= 1 83 | 84 | ret_code = 0 if (200 <= int(ret_code) 
< 300) else ret_code 85 | answer = self.fail_msg 86 | 87 | try: 88 | resp_struct = json.loads(response.text) 89 | answer = resp_struct['data']['content'][0]['text'].strip() 90 | except: 91 | pass 92 | 93 | return ret_code, answer, response 94 | 95 | 96 | class Claude3V(Claude_Wrapper): 97 | 98 | def generate(self, message, dataset=None): 99 | return super(Claude_Wrapper, self).generate(message) 100 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/gemini.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | headers = 'Content-Type: application/json' 5 | 6 | 7 | class GeminiWrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | 11 | def __init__(self, 12 | model: str = 'gemini-1.0-pro', 13 | retry: int = 5, 14 | wait: int = 5, 15 | key: str = None, 16 | verbose: bool = True, 17 | temperature: float = 0.0, 18 | system_prompt: str = None, 19 | max_tokens: int = 1024, 20 | proxy: str = None, 21 | backend='genai', 22 | project_id='vlmeval', 23 | **kwargs): 24 | 25 | assert model in ['gemini-1.0-pro', 'gemini-1.5-pro'] 26 | 27 | self.model = model 28 | self.fail_msg = 'Failed to obtain answer via API. ' 29 | self.max_tokens = max_tokens 30 | self.temperature = temperature 31 | if key is None: 32 | key = os.environ.get('GOOGLE_API_KEY', None) 33 | # Try to load backend from environment variable 34 | be = os.environ.get('GOOGLE_API_BACKEND', None) 35 | if be is not None and be in ['genai', 'vertex']: 36 | backend = be 37 | 38 | assert backend in ['genai', 'vertex'] 39 | if backend == 'genai': 40 | assert model == 'gemini-1.0-pro' 41 | 42 | self.backend = backend 43 | self.project_id = project_id 44 | 45 | assert key is not None 46 | self.api_key = key 47 | if proxy is not None: 48 | proxy_set(proxy) 49 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 50 | 51 | def build_msgs_genai(self, inputs): 52 | messages = [] if self.system_prompt is None else [self.system_prompt] 53 | for inp in inputs: 54 | if inp['type'] == 'text': 55 | messages.append(inp['value']) 56 | elif inp['type'] == 'image': 57 | messages.append(Image.open(inp['value'])) 58 | return messages 59 | 60 | def build_msgs_vertex(self, inputs): 61 | from vertexai.generative_models import Part, Image 62 | messages = [] if self.system_prompt is None else [self.system_prompt] 63 | for inp in inputs: 64 | if inp['type'] == 'text': 65 | messages.append(inp['value']) 66 | elif inp['type'] == 'image': 67 | messages.append(Part.from_image(Image.load_from_file(inp['value']))) 68 | return messages 69 | 70 | def generate_inner(self, inputs, **kwargs) -> str: 71 | if self.backend == 'genai': 72 | import google.generativeai as genai 73 | assert isinstance(inputs, list) 74 | pure_text = np.all([x['type'] == 'text' for x in inputs]) 75 | genai.configure(api_key=self.api_key) 76 | model = genai.GenerativeModel('gemini-pro') if pure_text else genai.GenerativeModel('gemini-pro-vision') 77 | messages = self.build_msgs_genai(inputs) 78 | gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature) 79 | gen_config.update(kwargs) 80 | try: 81 | answer = model.generate_content( 82 | messages, 83 | generation_config=genai.types.GenerationConfig(**gen_config)).text 84 | return 0, answer, 'Succeeded! 
' 85 | except Exception as err: 86 | if self.verbose: 87 | self.logger.error(err) 88 | self.logger.error(f'The input messages are {inputs}.') 89 | 90 | return -1, '', '' 91 | elif self.backend == 'vertex': 92 | import vertexai 93 | from vertexai.generative_models import GenerativeModel 94 | vertexai.init(project=self.project_id, location='us-central1') 95 | model_name = 'gemini-1.0-pro-vision' if self.model == 'gemini-1.0-pro' else 'gemini-1.5-pro' 96 | model = GenerativeModel(model_name=model_name) 97 | messages = self.build_msgs_vertex(inputs) 98 | try: 99 | resp = model.generate_content(messages) 100 | answer = resp.text 101 | return 0, answer, 'Succeeded! ' 102 | except Exception as err: 103 | if self.verbose: 104 | self.logger.error(err) 105 | self.logger.error(f'The input messages are {inputs}.') 106 | 107 | return -1, '', '' 108 | 109 | 110 | class GeminiProVision(GeminiWrapper): 111 | 112 | def generate(self, message, dataset=None): 113 | return super(GeminiProVision, self).generate(message) 114 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/glm_vision.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from vlmeval.utils.dataset import DATASET_TYPE 4 | from vlmeval.smp.vlm import encode_image_file_to_base64 5 | 6 | 7 | class GLMVisionWrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | 11 | def __init__(self, 12 | model: str, 13 | retry: int = 5, 14 | wait: int = 5, 15 | key: str = None, 16 | verbose: bool = True, 17 | system_prompt: str = None, 18 | max_tokens: int = 1024, 19 | proxy: str = None, 20 | **kwargs): 21 | 22 | self.model = model 23 | self.fail_msg = 'Failed to obtain answer via API. ' 24 | self.default_params = { 25 | 'top_p': 0.6, 26 | 'top_k': 2, 27 | 'temperature': 0.8, 28 | 'repetition_penalty': 1.1, 29 | 'best_of': 1, 30 | 'do_sample': True, 31 | 'stream': False, 32 | 'max_tokens': max_tokens 33 | } 34 | if key is None: 35 | key = os.environ.get('GLMV_API_KEY', None) 36 | assert key is not None, ( 37 | 'Please set the API Key (obtain it here: ' 38 | 'https://open.bigmodel.cn/dev/howuse/introduction)' 39 | ) 40 | self.key = key 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | def image_to_base64(self, image_path): 44 | import base64 45 | with open(image_path, 'rb') as image_file: 46 | encoded_string = base64.b64encode(image_file.read()) 47 | return encoded_string.decode('utf-8') 48 | 49 | def build_msgs(self, msgs_raw, system_prompt=None, dataset=None): 50 | msgs = cp.deepcopy(msgs_raw) 51 | content = [] 52 | text = '' 53 | for i, msg in enumerate(msgs): 54 | if msg['type'] == 'text': 55 | text += msg['value'] 56 | elif msg['type'] == 'image': 57 | content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value'])))) 58 | if dataset is not None and DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: 59 | text += '\nShort Answer.' 
60 | content.append(dict(type='text', text=text)) 61 | ret = [dict(role='user', content=content)] 62 | return ret 63 | 64 | def generate_inner(self, inputs, **kwargs) -> str: 65 | assert isinstance(inputs, str) or isinstance(inputs, list) 66 | inputs = [inputs] if isinstance(inputs, str) else inputs 67 | 68 | messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None)) 69 | 70 | url = 'https://api.chatglm.cn/v1/chat/completions' 71 | headers = { 72 | 'Content-Type': 'application/json', 73 | 'Request-Id': 'remote-test', 74 | 'Authorization': f'Bearer {self.key}' 75 | } 76 | payload = { 77 | 'model': self.model, 78 | 'messages': messages, 79 | **self.default_params 80 | } 81 | response = requests.post(url, headers=headers, data=json.dumps(payload), verify=False) 82 | output = [] 83 | try: 84 | assert response.status_code == 200 85 | for line in response.iter_lines(): 86 | data = json.loads(line.decode('utf-8').lstrip('data: ')) 87 | output.append(data['choices'][0]['message']['content']) 88 | answer = ''.join(output).replace('', '') 89 | if self.verbose: 90 | self.logger.info(f'inputs: {inputs}\nanswer: {answer}') 91 | return 0, answer, 'Succeeded! ' 92 | except Exception as err: 93 | if self.verbose: 94 | self.logger.error(err) 95 | self.logger.error(f'The input messages are {inputs}.') 96 | return -1, self.fail_msg, '' 97 | 98 | 99 | class GLMVisionAPI(GLMVisionWrapper): 100 | 101 | def generate(self, message, dataset=None): 102 | return super(GLMVisionAPI, self).generate(message, dataset=dataset) 103 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/gpt_int.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | import requests 4 | from ..smp import * 5 | from .gpt import GPT_context_window, OpenAIWrapper 6 | 7 | url = 'http://ecs.sv.us.alles-apin.openxlab.org.cn/v1/openai/v2/text/chat' 8 | headers = { 9 | 'Content-Type': 'application/json' 10 | } 11 | 12 | 13 | class OpenAIWrapperInternal(OpenAIWrapper): 14 | 15 | is_api: bool = True 16 | 17 | def __init__(self, 18 | model: str = 'gpt-3.5-turbo-0613', 19 | retry: int = 5, 20 | wait: int = 3, 21 | verbose: bool = True, 22 | system_prompt: str = None, 23 | temperature: float = 0, 24 | timeout: int = 60, 25 | max_tokens: int = 1024, 26 | img_size: int = 512, 27 | img_detail: str = 'low', 28 | **kwargs): 29 | 30 | self.model = model 31 | if 'KEYS' in os.environ and osp.exists(os.environ['KEYS']): 32 | keys = load(os.environ['KEYS']) 33 | headers['alles-apin-token'] = keys.get('alles-apin-token', '') 34 | elif 'ALLES' in os.environ: 35 | headers['alles-apin-token'] = os.environ['ALLES'] 36 | self.headers = headers 37 | self.temperature = temperature 38 | self.timeout = timeout 39 | self.max_tokens = max_tokens 40 | 41 | assert img_size > 0 or img_size == -1 42 | self.img_size = img_size 43 | assert img_detail in ['high', 'low'] 44 | self.img_detail = img_detail 45 | 46 | super(OpenAIWrapper, self).__init__( 47 | wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | def generate_inner(self, inputs, **kwargs) -> str: 50 | input_msgs = self.prepare_inputs(inputs) 51 | 52 | temperature = kwargs.pop('temperature', self.temperature) 53 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 54 | 55 | # Held out 100 tokens as buffer 56 | context_window = GPT_context_window(self.model) 57 | max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) 58 | if 
0 < max_tokens <= 100: 59 | print('Less than 100 tokens left, may exceed the context window with some additional meta symbols. ') 60 | if max_tokens <= 0: 61 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' 62 | 63 | payload = dict( 64 | model=self.model, 65 | messages=input_msgs, 66 | max_tokens=max_tokens, 67 | n=1, 68 | stop=None, 69 | timeout=self.timeout, 70 | temperature=temperature, 71 | **kwargs) 72 | 73 | response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 74 | ret_code = response.status_code 75 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 76 | 77 | answer = self.fail_msg 78 | try: 79 | resp_struct = json.loads(response.text) 80 | assert resp_struct['msg'] == 'ok' and resp_struct['msgCode'] == '10000', resp_struct 81 | answer = resp_struct['data']['choices'][0]['message']['content'].strip() 82 | except: 83 | pass 84 | return ret_code, answer, response 85 | 86 | 87 | class GPT4V_Internal(OpenAIWrapperInternal): 88 | 89 | def generate(self, message, dataset=None): 90 | return super(GPT4V_Internal, self).generate(message) 91 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/qwen_api.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | import os 3 | from vlmeval.api.base import BaseAPI 4 | from vlmeval.smp import * 5 | 6 | 7 | # Note: This is a pure language model API. 8 | class QwenAPI(BaseAPI): 9 | 10 | is_api: bool = True 11 | 12 | def __init__(self, 13 | model: str = 'qwen-max-1201', 14 | retry: int = 5, 15 | wait: int = 5, 16 | verbose: bool = True, 17 | seed: int = 2680, 18 | temperature: float = 0.0, 19 | system_prompt: str = None, 20 | key: str = None, 21 | max_tokens: int = 1024, 22 | proxy: str = None, 23 | **kwargs): 24 | 25 | assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext'] 26 | self.model = model 27 | import dashscope 28 | self.fail_msg = 'Failed to obtain answer via API. 
' 29 | self.max_tokens = max_tokens 30 | self.temperature = temperature 31 | self.seed = seed 32 | if key is None: 33 | key = os.environ.get('DASHSCOPE_API_KEY', None) 34 | assert key is not None, ( 35 | 'Please set the API Key (obtain it here: ' 36 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 37 | ) 38 | dashscope.api_key = key 39 | if proxy is not None: 40 | proxy_set(proxy) 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | @staticmethod 44 | def build_msgs(msgs_raw, system_prompt=None): 45 | msgs = cp.deepcopy(msgs_raw) 46 | ret = [] 47 | if system_prompt is not None: 48 | ret.append(dict(role='system', content=system_prompt)) 49 | for i, msg in enumerate(msgs): 50 | role = 'user' if i % 2 == 0 else 'assistant' 51 | ret.append(dict(role=role, content=msg)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | inputs = [inputs] if isinstance(inputs, str) else inputs 58 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 59 | 60 | import dashscope 61 | response = dashscope.Generation.call( 62 | model=self.model, 63 | messages=messages, 64 | seed=self.seed, 65 | temperature=self.temperature, 66 | max_tokens=self.max_tokens, 67 | result_format='message', # set the result to be "message" format. 68 | ) 69 | if response.status_code != HTTPStatus.OK: 70 | return -1, 'Error: Bad Response Statuse Code. ', f'The response status code is {response.status_code}. ' 71 | 72 | try: 73 | return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! ' 74 | except Exception as err: 75 | return -1, f'Error: Failed to parse the response. {err}', response 76 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/qwen_vl_api.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | 5 | class QwenVLWrapper(BaseAPI): 6 | 7 | is_api: bool = True 8 | 9 | def __init__(self, 10 | model: str = 'qwen-vl-plus', 11 | retry: int = 5, 12 | wait: int = 5, 13 | key: str = None, 14 | verbose: bool = True, 15 | temperature: float = 0.0, 16 | system_prompt: str = None, 17 | max_tokens: int = 1024, 18 | proxy: str = None, 19 | **kwargs): 20 | 21 | assert model in ['qwen-vl-plus', 'qwen-vl-max'] 22 | self.model = model 23 | import dashscope 24 | self.fail_msg = 'Failed to obtain answer via API. 
' 25 | self.max_tokens = max_tokens 26 | self.temperature = temperature 27 | if key is None: 28 | key = os.environ.get('DASHSCOPE_API_KEY', None) 29 | assert key is not None, ( 30 | 'Please set the API Key (obtain it here: ' 31 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 32 | ) 33 | dashscope.api_key = key 34 | if proxy is not None: 35 | proxy_set(proxy) 36 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 37 | 38 | @staticmethod 39 | def build_msgs(msgs_raw, system_prompt=None): 40 | msgs = cp.deepcopy(msgs_raw) 41 | ret = [] 42 | if system_prompt is not None: 43 | content = [dict(text=system_prompt)] 44 | ret.append(dict(role='system', content=content)) 45 | content = [] 46 | for msg in msgs: 47 | if msg['type'] == 'text': 48 | content.append(dict(text=msg['value'])) 49 | elif msg['type'] == 'image': 50 | content.append(dict(image='file://' + msg['value'])) 51 | ret.append(dict(role='user', content=content)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | pure_text = np.all([x['type'] == 'text' for x in inputs]) 58 | assert not pure_text 59 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 60 | gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature) 61 | gen_config.update(kwargs) 62 | try: 63 | response = MultiModalConversation.call(model=self.model, messages=messages) 64 | if self.verbose: 65 | print(response) 66 | answer = response.output.choices[0]['message']['content'][0]['text'] 67 | return 0, answer, 'Succeeded! ' 68 | except Exception as err: 69 | if self.verbose: 70 | self.logger.error(err) 71 | self.logger.error(f'The input messages are {inputs}.') 72 | 73 | return -1, '', '' 74 | 75 | 76 | class QwenVLAPI(QwenVLWrapper): 77 | 78 | def generate(self, message, dataset=None): 79 | return super(QwenVLAPI, self).generate(message) 80 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/reka.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import mimetypes 5 | 6 | 7 | class Reka_Wrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | INTERLEAVE: bool = False 11 | 12 | def __init__(self, 13 | model: str = 'reka-flash-20240226', 14 | key: str = None, 15 | retry: int = 10, 16 | wait: int = 3, 17 | system_prompt: str = None, 18 | verbose: bool = True, 19 | temperature: float = 0, 20 | max_tokens: int = 1024, 21 | **kwargs): 22 | 23 | try: 24 | import reka 25 | except ImportError: 26 | raise ImportError('Please install reka by running "pip install reka-api"') 27 | 28 | self.model = model 29 | default_kwargs = dict(temperature=temperature, request_output_len=max_tokens) 30 | default_kwargs.update(kwargs) 31 | self.kwargs = default_kwargs 32 | if key is not None: 33 | self.key = key 34 | else: 35 | self.key = os.environ.get('REKA_API_KEY', '') 36 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 37 | 38 | def generate_inner(self, inputs, **kwargs) -> str: 39 | import reka 40 | reka.API_KEY = self.key 41 | prompt, image_path = self.message_to_promptimg(inputs) 42 | image_b64 = encode_image_file_to_base64(image_path) 43 | 44 | response = reka.chat( 45 | 
model_name=self.model, 46 | human=prompt, 47 | media_url=f'data:image/jpeg;base64,{image_b64}', 48 | **self.kwargs) 49 | 50 | try: 51 | return 0, response['text'], response 52 | except: 53 | return -1, self.fail_msg, response 54 | 55 | 56 | class Reka(Reka_Wrapper): 57 | 58 | def generate(self, message, dataset=None): 59 | return super(Reka_Wrapper, self).generate(message) 60 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/stepai.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | url = 'https://api.stepfun.com/v1/chat/completions' 5 | headers = { 6 | 'Content-Type': 'application/json', 7 | 'Authorization': 'Bearer {}', 8 | } 9 | 10 | 11 | class StepAPI_INT(BaseAPI): 12 | 13 | is_api: bool = True 14 | 15 | def __init__(self, 16 | model: str = 'step-1v-8k', 17 | retry: int = 10, 18 | wait: int = 3, 19 | key: str = None, 20 | temperature: float = 0, 21 | max_tokens: int = 300, 22 | verbose: bool = True, 23 | system_prompt: str = None, 24 | **kwargs): 25 | self.model = model 26 | self.fail_msg = 'Fail to obtain answer via API.' 27 | self.headers = headers 28 | self.temperature = temperature 29 | self.max_tokens = max_tokens 30 | self.system_prompt = system_prompt 31 | if key is not None: 32 | self.key = key 33 | else: 34 | self.key = os.environ.get('STEPAI_API_KEY', '') 35 | headers['Authorization'] = headers['Authorization'].format(self.key) 36 | 37 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 38 | 39 | @staticmethod 40 | def build_msgs(msgs_raw): 41 | messages = [] 42 | message = {'role': 'user', 'content': []} 43 | 44 | for msg in msgs_raw: 45 | if msg['type'] == 'image': 46 | image_b64 = encode_image_file_to_base64(msg['value']) 47 | message['content'].append({ 48 | 'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)}, 49 | 'type': 'image_url' 50 | }) 51 | elif msg['type'] == 'text': 52 | message['content'].append({ 53 | 'text': msg['value'], 54 | 'type': 'text' 55 | }) 56 | 57 | messages.append(message) 58 | return messages 59 | 60 | def generate_inner(self, inputs, **kwargs) -> str: 61 | print(inputs, '\n') 62 | payload = dict( 63 | model=self.model, 64 | max_tokens=self.max_tokens, 65 | temperature=self.temperature, 66 | messages=self.build_msgs(msgs_raw=inputs), 67 | **kwargs) 68 | response = requests.post(url, headers=headers, data=json.dumps(payload)) 69 | ret_code = response.status_code 70 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 71 | 72 | answer = self.fail_msg 73 | try: 74 | resp_struct = json.loads(response.text) 75 | answer = resp_struct['choices'][0]['message']['content'].strip() 76 | except: 77 | pass 78 | return ret_code, answer, response 79 | 80 | 81 | class Step1V_INT(StepAPI_INT): 82 | 83 | def generate(self, message, dataset=None): 84 | return super(StepAPI_INT, self).generate(message) 85 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/OCRBench.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | 3 | 4 | def OCRBench_eval(eval_file): 5 | OCRBench_score = { 6 | 'Regular Text Recognition': 0, 7 | 'Irregular Text Recognition': 0, 8 | 'Artistic Text Recognition': 0, 9 | 'Handwriting Recognition': 0, 10 | 'Digit String Recognition': 0, 11 | 'Non-Semantic Text Recognition': 0, 12 | 'Scene 
Text-centric VQA': 0, 13 | 'Doc-oriented VQA': 0, 14 | 'Key Information Extraction': 0, 15 | 'Handwritten Mathematical Expression Recognition': 0 16 | } 17 | 18 | logger = get_logger('Evaluation') 19 | 20 | data = load(eval_file) 21 | lt = len(data) 22 | lines = [data.iloc[i] for i in range(lt)] 23 | for i in tqdm(range(len(lines))): 24 | line = lines[i] 25 | predict = str(line['prediction']) 26 | answers = eval(line['answer']) 27 | category = line['category'] 28 | if category == 'Handwritten Mathematical Expression Recognition': 29 | for j in range(len(answers)): 30 | answer = answers[j].strip().replace('\n', ' ').replace(' ', '') 31 | predict = predict.strip().replace('\n', ' ').replace(' ', '') 32 | if answer in predict: 33 | OCRBench_score[category] += 1 34 | break 35 | else: 36 | for j in range(len(answers)): 37 | answer = answers[j].lower().strip().replace('\n', ' ') 38 | predict = predict.lower().strip().replace('\n', ' ') 39 | if answer in predict: 40 | OCRBench_score[category] += 1 41 | break 42 | 43 | final_score_dict = {} 44 | final_score_dict['Text Recognition'] = ( 45 | OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] 46 | + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition'] 47 | + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'] 48 | ) 49 | final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] 50 | final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] 51 | final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] 52 | final_score_dict['Handwritten Mathematical Expression Recognition'] = \ 53 | OCRBench_score['Handwritten Mathematical Expression Recognition'] 54 | final_score_dict['Final Score'] = ( 55 | final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] 56 | + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] 57 | + final_score_dict['Handwritten Mathematical Expression Recognition'] 58 | ) 59 | final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 60 | score_pth = eval_file.replace('.xlsx', '_score.json') 61 | dump(final_score_dict, score_pth) 62 | logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 63 | logger.info('Score: ') 64 | for key, value in final_score_dict.items(): 65 | logger.info('{}:{}'.format(key, value)) 66 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .yes_or_no import default_rating, MME_rating, YOrN_eval 2 | from .mmvet_eval import MMVet_eval 3 | from .multiple_choice import multiple_choice_eval 4 | from .coco_eval import COCO_eval 5 | from .vqa_eval import VQAEval 6 | from .mathvista_eval import MathVista_eval 7 | from .llavabench import LLaVABench_eval 8 | from .misc import build_judge 9 | from .OCRBench import OCRBench_eval 10 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__pycache__/OCRBench.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/OCRBench.cpython-39.pyc 
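The scoring rule implemented by OCRBench_eval above is plain substring containment: each candidate answer is normalised (lower-cased, newlines collapsed to spaces; for the handwritten math category all whitespace is removed instead) and a sample scores one point if any normalised answer occurs inside the normalised prediction. A minimal self-contained sketch of that per-sample check, using hypothetical inputs that are not part of the repository:

def ocrbench_match(prediction, answers, category):
    # Mirrors the per-sample check in OCRBench_eval: containment after normalisation.
    if category == 'Handwritten Mathematical Expression Recognition':
        pred = prediction.strip().replace('\n', ' ').replace(' ', '')
        return any(a.strip().replace('\n', ' ').replace(' ', '') in pred for a in answers)
    pred = prediction.lower().strip().replace('\n', ' ')
    return any(a.lower().strip().replace('\n', ' ') in pred for a in answers)

# Hypothetical example: counts as correct because 'hello' appears in the normalised prediction.
assert ocrbench_match('The sign says HELLO.', ['hello'], 'Regular Text Recognition')

The 'Final Score Norm' reported by OCRBench_eval divides the raw sum by 10, which yields a percentage for the standard 1000-sample OCRBench split.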
-------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__pycache__/coco_eval.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/coco_eval.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__pycache__/llavabench.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/llavabench.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__pycache__/mathvista_eval.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/mathvista_eval.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__pycache__/misc.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/misc.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__pycache__/mmvet_eval.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/mmvet_eval.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__pycache__/multiple_choice.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/multiple_choice.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__pycache__/vqa_eval.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/vqa_eval.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/__pycache__/yes_or_no.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/evaluate/__pycache__/yes_or_no.cpython-39.pyc -------------------------------------------------------------------------------- 
/VLMEvalKit/vlmeval/evaluate/coco_eval.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from pycocoevalcap.bleu.bleu import Bleu 3 | from pycocoevalcap.rouge.rouge import Rouge 4 | from pycocoevalcap.cider.cider import Cider 5 | 6 | 7 | class COCO_Caption_Scorer(): 8 | def __init__(self, ref, gt): 9 | self.ref = ref 10 | self.gt = gt 11 | print('setting up scorers...') 12 | self.scorers = [ 13 | (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), 14 | # (Meteor(), "METEOR"), # need java version 11.0.16+ 15 | (Rouge(), 'ROUGE_L'), 16 | (Cider(), 'CIDEr'), 17 | # (Spice(), "SPICE"), # need java version 11.0.16+ 18 | ] 19 | 20 | def compute_scores(self): 21 | total_scores = {} 22 | for scorer, method in self.scorers: 23 | print('computing %s score...' % (scorer.method())) 24 | score, scores = scorer.compute_score(self.gt, self.ref) 25 | if type(method) == list: 26 | for sc, scs, m in zip(score, scores, method): 27 | print('%s: %0.3f' % (m, sc * 100)) 28 | total_scores['Bleu'] = [x * 100 for x in score] 29 | else: 30 | print('%s: %0.3f' % (method, score * 100)) 31 | total_scores[method] = score * 100 32 | 33 | print('*****DONE*****') 34 | for key, value in total_scores.items(): 35 | print('{}:{}'.format(key, value)) 36 | return total_scores 37 | 38 | 39 | def COCO_eval(eval_file, nproc=4, verbose=False): 40 | logger = get_logger('Evaluation') 41 | 42 | data = load(eval_file) 43 | 44 | lt = len(data) 45 | lines = [data.iloc[i] for i in range(lt)] 46 | ref = {} 47 | gt = {} 48 | for i, line in enumerate(lines): 49 | ref[str(i)] = [str(line['prediction'])] 50 | gt[str(i)] = eval(line['answer']) 51 | 52 | scorer = COCO_Caption_Scorer(ref, gt) 53 | coco_caption_score_dict = scorer.compute_scores() 54 | 55 | score_pth = eval_file.replace('.xlsx', '_score.json') 56 | dump(coco_caption_score_dict, score_pth) 57 | logger.info(f'COCO_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 58 | logger.info('Score: ') 59 | for key, value in coco_caption_score_dict.items(): 60 | logger.info('{}:{}'.format(key, value)) 61 | 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser(description='Inference LLM Answers. ') 65 | parser.add_argument('--data', type=str, help='The question set for inference, in excel / tsv / json format. 
') 66 | parser.add_argument('--nproc', type=int, default=4) 67 | parser.add_argument('--verbose', action='store_true') 68 | args = parser.parse_args() 69 | return args 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | COCO_eval(eval_file=args.data, nproc=args.nproc, verbose=args.verbose) 75 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/evaluate/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | from vlmeval.api import OpenAIWrapper, OpenAIWrapperInternal 3 | from vlmeval.smp import load_env 4 | 5 | INTERNAL = os.environ.get('INTERNAL', 0) 6 | 7 | 8 | def build_judge(**kwargs): 9 | model = kwargs.pop('model', None) 10 | load_env() 11 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None) 12 | if LOCAL_LLM is None: 13 | model_map = { 14 | 'gpt-4-turbo': 'gpt-4-1106-preview', 15 | 'gpt-4-0613': 'gpt-4-0613', 16 | 'gpt-4-0314': 'gpt-4-0314', 17 | 'gpt-4-0125': 'gpt-4-0125-preview', 18 | 'chatgpt-1106': 'gpt-3.5-turbo-1106', 19 | 'chatgpt-0613': 'gpt-3.5-turbo-0613', 20 | 'chatgpt-0125': 'gpt-3.5-turbo-0125' 21 | } 22 | model_version = model_map[model] 23 | else: 24 | model_version = LOCAL_LLM 25 | if INTERNAL: 26 | model = OpenAIWrapperInternal(model_version, **kwargs) 27 | else: 28 | model = OpenAIWrapper(model_version, **kwargs) 29 | return model 30 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .vlm import * 3 | from .misc import * 4 | from .log import * 5 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/smp/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/smp/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/smp/__pycache__/file.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/smp/__pycache__/file.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/smp/__pycache__/log.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/smp/__pycache__/log.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/smp/__pycache__/misc.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/smp/__pycache__/misc.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/smp/__pycache__/vlm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/smp/__pycache__/vlm.cpython-39.pyc 
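COCO_Caption_Scorer, defined in evaluate/coco_eval.py above, expects two dicts keyed by the stringified sample index, each mapping to a list of captions: ref holds model predictions and gt holds ground-truth captions, exactly as COCO_eval builds them. A minimal usage sketch with made-up captions (the import path follows the package layout shown in this listing, and pycocoevalcap must be installed):

from vlmeval.evaluate.coco_eval import COCO_Caption_Scorer

# Hypothetical predictions (ref) and ground truths (gt), keyed by stringified index as in COCO_eval.
ref = {'0': ['a dog running on the grass'], '1': ['a man riding a bicycle']}
gt = {'0': ['a dog runs across a grassy field'], '1': ['a man rides a bike down a street']}

scorer = COCO_Caption_Scorer(ref, gt)
scores = scorer.compute_scores()  # {'Bleu': [...], 'ROUGE_L': ..., 'CIDEr': ...}, scaled to 0-100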
-------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_initialized = {} 4 | 5 | 6 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 7 | logger = logging.getLogger(name) 8 | if name in logger_initialized: 9 | return logger 10 | 11 | for logger_name in logger_initialized: 12 | if name.startswith(logger_name): 13 | return logger 14 | 15 | stream_handler = logging.StreamHandler() 16 | handlers = [stream_handler] 17 | 18 | try: 19 | import torch.distributed as dist 20 | if dist.is_available() and dist.is_initialized(): 21 | rank = dist.get_rank() 22 | else: 23 | rank = 0 24 | except ImportError: 25 | rank = 0 26 | 27 | if rank == 0 and log_file is not None: 28 | file_handler = logging.FileHandler(log_file, file_mode) 29 | handlers.append(file_handler) 30 | 31 | formatter = logging.Formatter( 32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 33 | for handler in handlers: 34 | handler.setFormatter(formatter) 35 | handler.setLevel(log_level) 36 | logger.addHandler(handler) 37 | 38 | if rank == 0: 39 | logger.setLevel(log_level) 40 | else: 41 | logger.setLevel(logging.ERROR) 42 | 43 | logger_initialized[name] = True 44 | return logger 45 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/smp/misc.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401, F403 2 | import abc 3 | import argparse 4 | import csv 5 | import multiprocessing as mp 6 | import os 7 | import os.path as osp 8 | import copy as cp 9 | import random as rd 10 | import requests 11 | import shutil 12 | import subprocess 13 | import warnings 14 | import logging 15 | import pandas as pd 16 | from collections import OrderedDict, defaultdict 17 | from multiprocessing import Pool, current_process 18 | from tqdm import tqdm 19 | import datetime 20 | import matplotlib.pyplot as plt 21 | import seaborn as sns 22 | from tabulate import tabulate_formats, tabulate 23 | from huggingface_hub import scan_cache_dir 24 | from sty import fg, bg, ef, rs 25 | 26 | def process_punctuation(inText): 27 | import re 28 | outText = inText 29 | punct = [ 30 | ';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', 31 | '>', '<', '@', '`', ',', '?', '!' 
32 | ] 33 | commaStrip = re.compile('(\d)(,)(\d)') # noqa: W605 34 | periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') # noqa: W605 35 | for p in punct: 36 | if (p + ' ' in inText or ' ' + p in inText) or (re.search( 37 | commaStrip, inText) is not None): 38 | outText = outText.replace(p, '') 39 | else: 40 | outText = outText.replace(p, ' ') 41 | outText = periodStrip.sub('', outText, re.UNICODE) 42 | return outText 43 | 44 | def h2r(value): 45 | if value[0] == '#': 46 | value = value[1:] 47 | assert len(value) == 6 48 | return tuple(int(value[i:i + 2], 16) for i in range(0, 6, 2)) 49 | 50 | def r2h(rgb): 51 | return '#%02x%02x%02x' % rgb 52 | 53 | def colored(s, color): 54 | if isinstance(color, str): 55 | if hasattr(fg, color): 56 | return getattr(fg, color) + s + fg.rs 57 | color = h2r(color) 58 | return fg(*color) + s + fg.rs 59 | 60 | def istype(s, type): 61 | if isinstance(s, type): 62 | return True 63 | try: 64 | return isinstance(eval(s), type) 65 | except Exception as _: 66 | return False 67 | 68 | def bincount(lst): 69 | bins = defaultdict(lambda: 0) 70 | for item in lst: 71 | bins[item] += 1 72 | return bins 73 | 74 | def get_cache_path(repo_id): 75 | hf_cache_info = scan_cache_dir() 76 | repos = list(hf_cache_info.repos) 77 | repo = None 78 | for r in repos: 79 | if r.repo_id == repo_id: 80 | repo = r 81 | break 82 | if repo is None: 83 | return None 84 | revs = list(repo.revisions) 85 | rev2keep, last_modified = None, 0 86 | for rev in revs: 87 | if rev.last_modified > last_modified: 88 | rev2keep, last_modified = rev, rev.last_modified 89 | if rev2keep is None: 90 | return None 91 | return str(rev2keep.snapshot_path) 92 | 93 | def proxy_set(s): 94 | import os 95 | for key in ['http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY']: 96 | os.environ[key] = s 97 | 98 | def get_rank_and_world_size(): 99 | rank = int(os.environ.get('RANK', 0)) 100 | world_size = int(os.environ.get('WORLD_SIZE', 1)) 101 | return rank, world_size 102 | 103 | def splitlen(s, sym='/'): 104 | return len(s.split(sym)) 105 | 106 | def listinstr(lst, s): 107 | assert isinstance(lst, list) 108 | for item in lst: 109 | if item in s: 110 | return True 111 | return False 112 | 113 | def d2df(D): 114 | return pd.DataFrame({x: [D[x]] for x in D}) 115 | 116 | def cn_string(s): 117 | import re 118 | if re.search(u'[\u4e00-\u9fff]', s): 119 | return True 120 | return False 121 | 122 | try: 123 | import decord 124 | except ImportError: 125 | pass 126 | 127 | def timestr(second=True, minute=False): 128 | s = datetime.datetime.now().strftime('%Y%m%d%H%M%S')[2:] 129 | if second: 130 | return s 131 | elif minute: 132 | return s[:-2] 133 | else: 134 | return s[:-4] 135 | 136 | def dict_merge(dct, merge_dct): 137 | for k, _ in merge_dct.items(): 138 | if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict)): #noqa 139 | dict_merge(dct[k], merge_dct[k]) 140 | else: 141 | dct[k] = merge_dct[k] 142 | 143 | def youtube_dl(idx): 144 | cmd = f'youtube-dl -f best -f mp4 "{idx}" -o {idx}.mp4' 145 | os.system(cmd) 146 | 147 | def run_command(cmd): 148 | if isinstance(cmd, str): 149 | cmd = cmd.split() 150 | return subprocess.check_output(cmd).decode() 151 | 152 | def load_env(): 153 | logger = logging.getLogger('LOAD_ENV') 154 | try: 155 | import vlmeval 156 | except ImportError: 157 | logger.error('VLMEval is not installed. Failed to import environment variables from .env file. 
') 158 | return 159 | pth = osp.realpath(vlmeval.__path__[0]) 160 | pth = osp.join(pth, '../.env') 161 | pth = osp.realpath(pth) 162 | if not osp.exists(pth): 163 | logger.error(f'Did not detect the .env file at {pth}, failed to load. ') 164 | return 165 | 166 | from dotenv import dotenv_values 167 | values = dotenv_values(pth) 168 | for k, v in values.items(): 169 | if v is not None and len(v): 170 | os.environ[k] = v 171 | logger.info(f'API Keys successfully loaded from {pth}') 172 | 173 | def pip_install_robust(package): 174 | import sys 175 | retry = 3 176 | while retry > 0: 177 | try: 178 | package_base = package.split('=')[0] 179 | module = __import__(package) 180 | return True 181 | except ImportError: 182 | subprocess.check_call([sys.executable, '-m', 'pip', 'install', package]) 183 | retry -= 1 184 | return False 185 | 186 | 187 | def version_cmp(v1, v2, op='eq'): 188 | from packaging import version 189 | import operator 190 | op_func = getattr(operator, op) 191 | return op_func(version.parse(v1), version.parse(v2)) 192 | 193 | 194 | def toliststr(s): 195 | if isinstance(s, str) and (s[0] == '[') and (s[-1] == ']'): 196 | return [str(x) for x in eval(s)] 197 | elif isinstance(s, str): 198 | return [s] 199 | elif isinstance(s, list): 200 | return [str(x) for x in s] 201 | raise NotImplementedError 202 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/smp/vlm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import pandas as pd 4 | import numpy as np 5 | import string 6 | from uuid import uuid4 7 | import os.path as osp 8 | import base64 9 | from PIL import Image 10 | from .file import load, dump 11 | Image.MAX_IMAGE_PIXELS = 1e9 12 | 13 | 14 | def mmqa_display(question, target_size=512): 15 | question = {k.lower(): v for k, v in question.items()} 16 | keys = list(question.keys()) 17 | keys = [k for k in keys if k not in ['index', 'image']] 18 | 19 | images = question['image'] 20 | if isinstance(images, str): 21 | images = [images] 22 | 23 | idx = question.pop('index', 'XXX') 24 | print(f'INDEX: {idx}') 25 | 26 | for im in images: 27 | image = decode_base64_to_image(im, target_size=target_size) 28 | display(image) # noqa: F821 29 | 30 | for k in keys: 31 | try: 32 | if not pd.isna(question[k]): 33 | print(f'{k.upper()}. {question[k]}') 34 | except ValueError: 35 | if False in pd.isna(question[k]): 36 | print(f'{k.upper()}. 
{question[k]}') 37 | 38 | 39 | def encode_image_to_base64(img, target_size=-1): 40 | # if target_size == -1, will not do resizing 41 | # else, will set the max_size ot (target_size, target_size) 42 | if img.mode in ('RGBA', 'P'): 43 | img = img.convert('RGB') 44 | tmp = osp.join('/tmp', str(uuid4()) + '.jpg') 45 | if target_size > 0: 46 | img.thumbnail((target_size, target_size)) 47 | img.save(tmp) 48 | with open(tmp, 'rb') as image_file: 49 | image_data = image_file.read() 50 | ret = base64.b64encode(image_data).decode('utf-8') 51 | os.remove(tmp) 52 | return ret 53 | 54 | 55 | def encode_image_file_to_base64(image_path, target_size=-1): 56 | image = Image.open(image_path) 57 | return encode_image_to_base64(image, target_size=target_size) 58 | 59 | 60 | def decode_base64_to_image(base64_string, target_size=-1): 61 | image_data = base64.b64decode(base64_string) 62 | image = Image.open(io.BytesIO(image_data)) 63 | if image.mode in ('RGBA', 'P'): 64 | image = image.convert('RGB') 65 | if target_size > 0: 66 | image.thumbnail((target_size, target_size)) 67 | return image 68 | 69 | 70 | def decode_base64_to_image_file(base64_string, image_path, target_size=-1): 71 | image = decode_base64_to_image(base64_string, target_size=target_size) 72 | image.save(image_path) 73 | 74 | 75 | def build_option_str(option_dict): 76 | s = 'There are several options: \n' 77 | for c, content in option_dict.items(): 78 | if not pd.isna(content): 79 | s += f'{c}. {content}\n' 80 | return s 81 | 82 | 83 | def isimg(s): 84 | return osp.exists(s) or s.startswith('http') 85 | 86 | 87 | def read_ok(img_path): 88 | if not osp.exists(img_path): 89 | return False 90 | try: 91 | im = Image.open(img_path) 92 | assert im.size[0] > 0 and im.size[1] > 0 93 | return True 94 | except: 95 | return False 96 | 97 | 98 | def gpt_key_set(): 99 | openai_key = os.environ.get('OPENAI_API_KEY', None) 100 | return isinstance(openai_key, str) and openai_key.startswith('sk-') 101 | 102 | 103 | def apiok(wrapper): 104 | s = wrapper.generate('Hello!') 105 | return wrapper.fail_msg not in s 106 | 107 | 108 | def circular_pred(df, extract_func=None): 109 | if extract_func is None: 110 | extract_func = lambda x: x # noqa: E731 111 | df = df.sort_values('index') 112 | from vlmeval.utils import can_infer_option 113 | shift = int(1e6) 114 | 115 | choices = [extract_func(x) for x in df['prediction']] 116 | pred_map = {i: c for i, c in zip(df['index'], choices)} 117 | flag_map = {i: True for i in pred_map if i < 1e6} 118 | valid_map = {i: True for i in pred_map if i < 1e6} 119 | for i in df['index']: 120 | if i >= shift and pred_map[i] and pred_map[i - shift]: 121 | if ( 122 | pred_map[i] not in list(string.ascii_uppercase) or # noqa: W504 123 | pred_map[i - shift] not in list(string.ascii_uppercase) 124 | ): 125 | 126 | valid_map[i % shift] = False 127 | continue 128 | if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1: 129 | continue 130 | else: 131 | flag_map[i % shift] = False 132 | flag_map = {k: v for k, v in flag_map.items() if valid_map[k]} 133 | flags = list(flag_map.values()) 134 | return np.mean(flags) 135 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .matching_util import can_infer, can_infer_option, can_infer_text 2 | from .mp_util import track_progress_rich 3 | from .custom_prompt import CustomPrompt 4 | from .dataset_config import dataset_URLs, img_root_map, 
DATASET_TYPE, abbr2full 5 | from .dataset import TSVDataset, split_MMMU 6 | from .result_transfer import MMMU_result_transfer, MMTBench_result_transfer 7 | 8 | 9 | __all__ = [ 10 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 11 | 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt', 12 | 'split_MMMU', 'abbr2full', 'MMMU_result_transfer', 'MMTBench_result_transfer' 13 | ] 14 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/__pycache__/custom_prompt.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/custom_prompt.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/__pycache__/dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/dataset.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/__pycache__/dataset_config.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/dataset_config.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/__pycache__/matching_util.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/matching_util.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/__pycache__/mp_util.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/mp_util.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/__pycache__/result_transfer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/utils/__pycache__/result_transfer.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/custom_prompt.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | from .dataset_config import img_root_map 3 | from abc import abstractmethod 4 | 5 | 6 | class CustomPrompt: 7 | 8 | @abstractmethod 9 | def use_custom_prompt(self, dataset): 10 | raise NotImplementedError 11 | 12 
| @abstractmethod 13 | def build_prompt(self, line, dataset): 14 | raise NotImplementedError 15 | 16 | def dump_image(self, line, dataset): 17 | ROOT = LMUDataRoot() 18 | assert isinstance(dataset, str) 19 | img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) 20 | os.makedirs(img_root, exist_ok=True) 21 | 22 | if 'image' in line: 23 | if isinstance(line['image'], list): 24 | tgt_path = [] 25 | assert 'image_path' in line 26 | for img, im_name in zip(line['image'], line['image_path']): 27 | path = osp.join(img_root, im_name) 28 | if not read_ok(path): 29 | decode_base64_to_image_file(img, path) 30 | tgt_path.append(path) 31 | else: 32 | tgt_path = osp.join(img_root, f"{line['index']}.jpg") 33 | if not read_ok(tgt_path): 34 | decode_base64_to_image_file(line['image'], tgt_path) 35 | tgt_path = [tgt_path] 36 | else: 37 | assert 'image_path' in line 38 | tgt_path = toliststr(line['image_path']) 39 | 40 | return tgt_path 41 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import hashlib 3 | from ..smp import * 4 | from .dataset_config import dataset_URLs, dataset_md5_dict, DATASET_TYPE 5 | from .custom_prompt import CustomPrompt 6 | 7 | 8 | def check_md5(data_path, dataset): 9 | if dataset not in dataset_md5_dict: 10 | warnings.warn(f'We do not have an md5 record for dataset {dataset}, skip the md5 check. ') 11 | return True 12 | assert osp.exists(data_path) 13 | with open(data_path, 'rb') as f: 14 | hash = hashlib.new('md5') 15 | for chunk in iter(lambda: f.read(2**20), b''): 16 | hash.update(chunk) 17 | if str(hash.hexdigest()) == dataset_md5_dict[dataset]: 18 | return True 19 | else: 20 | warnings.warn('this data file is incomplete, so it needs to be downloaded again.') 21 | return False 22 | 23 | 24 | def split_MMMU(msgs): 25 | text, images = None, [] 26 | for s in msgs: 27 | if s['type'] == 'image': 28 | images.append(s['value']) 29 | elif s['type'] == 'text': 30 | assert text is None 31 | text = s['value'] 32 | text_segs = text.split('<image ') 33 | segs = [dict(type='text', value=text_segs[0])] 34 | for i, seg in enumerate(text_segs): 35 | if i == 0: 36 | continue 37 | assert istype(seg[0], int) and seg[1] == '>' 38 | image_idx = int(seg[0]) - 1 39 | segs.append(dict(type='image', value=images[image_idx])) 40 | segs.append(dict(type='text', value=seg[2:])) 41 | return segs 42 | 43 | 44 | def prep_tsv(dataset): 45 | data_root = LMUDataRoot() 46 | assert osp.exists(data_root) 47 | update_flag = False 48 | 49 | if dataset in dataset_URLs: 50 | url = dataset_URLs[dataset] 51 | file_name = url.split('/')[-1] 52 | data_path = osp.join(data_root, file_name) 53 | 54 | if osp.exists(data_path) and check_md5(data_path, dataset): 55 | pass 56 | else: 57 | warnings.warn('The dataset tsv is not downloaded') 58 | download_file(url, data_path) 59 | update_flag = True 60 | else: 61 | data_path = osp.join(data_root, dataset + '.tsv') 62 | assert osp.exists(data_path) 63 | 64 | if file_size(data_path, 'GB') > 1: 65 | local_path = data_path.replace('.tsv', '_local.tsv') 66 | if not osp.exists(local_path) or update_flag or os.environ.get('FORCE_LOCAL', None): 67 | from ..tools import LOCALIZE 68 | LOCALIZE(data_path, local_path) 69 | return local_path 70 | else: 71 | return data_path 72 | 73 | 74 | class TSVDataset(CustomPrompt): 75 | 76 | def __init__(self, dataset='MMBench', skip_noimg=True): 77 | 78 | self.data_root = LMUDataRoot() 79 | self.dataset = dataset 80 | self.dataset_type = DATASET_TYPE(dataset) 81 | self.data_path = prep_tsv(dataset) 82 | data = 
load(self.data_path) 83 | 84 | self.skip_noimg = skip_noimg 85 | if skip_noimg and 'image' in data: 86 | data = data[~pd.isna(data['image'])] 87 | 88 | # Prompt for Captioning 89 | if listinstr(['COCO'], dataset): 90 | data['question'] = [( 91 | 'Please describe this image in general. Directly provide the description, ' 92 | 'do not include prefix like "This image depicts". ' 93 | )] * len(data) 94 | 95 | data['index'] = [str(x) for x in data['index']] 96 | 97 | self.meta_only = True 98 | if 'image' in data: 99 | data['image'] = [str(x) for x in data['image']] 100 | 101 | image_map = {x: y for x, y in zip(data['index'], data['image'])} 102 | for k in image_map: 103 | if len(image_map[k]) <= 64: 104 | idx = image_map[k] 105 | assert idx in image_map and len(image_map[idx]) > 64 106 | image_map[k] = image_map[idx] 107 | 108 | images = [toliststr(image_map[k]) for k in data['index']] 109 | data['image'] = [x[0] if len(x) == 1 else x for x in images] 110 | self.meta_only = False 111 | 112 | if 'image_path' in data: 113 | paths = [toliststr(x) for x in data['image_path']] 114 | data['image_path'] = [x[0] if len(x) == 1 else x for x in paths] 115 | 116 | if np.all([istype(x, int) for x in data['index']]): 117 | data['index'] = [int(x) for x in data['index']] 118 | 119 | self.data = data 120 | 121 | def __len__(self): 122 | return len(self.data) 123 | 124 | def build_prompt(self, line, dataset=None): 125 | if dataset is None: 126 | dataset = self.dataset 127 | 128 | if isinstance(line, int): 129 | line = self.data.iloc[line] 130 | 131 | if self.meta_only: 132 | tgt_path = toliststr(line['image_path']) 133 | else: 134 | tgt_path = self.dump_image(line, dataset) 135 | 136 | prompt = line['question'] 137 | if DATASET_TYPE(dataset) == 'multi-choice': 138 | question = line['question'] 139 | options = { 140 | cand: line[cand] 141 | for cand in string.ascii_uppercase 142 | if cand in line and not pd.isna(line[cand]) 143 | } 144 | options_prompt = 'Options:\n' 145 | for key, item in options.items(): 146 | options_prompt += f'{key}. {item}\n' 147 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 148 | prompt = '' 149 | if hint is not None: 150 | prompt += f'Hint: {hint}\n' 151 | prompt += f'Question: {question}\n' 152 | if len(options): 153 | prompt += options_prompt 154 | prompt += 'Please select the correct answer from the options above. 
\n' 155 | elif DATASET_TYPE(dataset) == 'VQA': 156 | if listinstr(['ocrvqa', 'textvqa', 'chartqa', 'docvqa'], dataset.lower()): 157 | prompt += '\nAnswer the question using a single word or phrase.\n' 158 | 159 | msgs = [] 160 | if isinstance(tgt_path, list): 161 | msgs.extend([dict(type='image', value=p) for p in tgt_path]) 162 | else: 163 | msgs = [dict(type='image', value=tgt_path)] 164 | msgs.append(dict(type='text', value=prompt)) 165 | 166 | return msgs 167 | 168 | def display(self, line): 169 | if isinstance(line, int): 170 | line = self.data.iloc[line] 171 | mmqa_display(line) 172 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/matching_util.py: -------------------------------------------------------------------------------- 1 | import string 2 | import copy as cp 3 | import os 4 | from ..smp import * 5 | 6 | 7 | def can_infer_option(answer, choices): 8 | verbose = os.environ.get('VERBOSE', 0) 9 | # Choices is a dictionary 10 | if 'Failed to obtain answer via API' in answer: 11 | return False 12 | 13 | reject_to_answer = [ 14 | "Sorry, I can't help with images of people yet.", 15 | "I can't process this file.", 16 | "I'm sorry, but without the image provided", 17 | 'Cannot determine the answer' 18 | ] 19 | for err in reject_to_answer: 20 | if err in answer: 21 | return 'Z' 22 | 23 | def count_choice(splits, choices, prefix='', suffix=''): 24 | cnt = 0 25 | for c in choices: 26 | if prefix + c + suffix in splits: 27 | cnt += 1 28 | return cnt 29 | 30 | answer_mod = cp.copy(answer) 31 | chars = '.()[],:;!*#{}' 32 | for c in chars: 33 | answer_mod = answer_mod.replace(c, ' ') 34 | 35 | splits = [x.strip() for x in answer_mod.split()] 36 | count = count_choice(splits, choices) 37 | 38 | if count == 1: 39 | for ch in choices: 40 | if 'A' in splits and len(splits) > 3 and verbose: 41 | logger = get_logger('Evaluation') 42 | logger.info(f'A might be a quantifier in the string: {answer}.') 43 | return False 44 | if ch in splits: 45 | return ch 46 | elif count == 0 and count_choice(splits, {'Z', ''}) == 1: 47 | return 'Z' 48 | return False 49 | 50 | 51 | def can_infer_text(answer, choices): 52 | answer = answer.lower() 53 | assert isinstance(choices, dict) 54 | for k in choices: 55 | assert k in string.ascii_uppercase 56 | choices[k] = str(choices[k]).lower() 57 | cands = [] 58 | for k in choices: 59 | if choices[k] in answer: 60 | cands.append(k) 61 | if len(cands) == 1: 62 | return cands[0] 63 | return False 64 | 65 | 66 | def can_infer(answer, choices): 67 | answer = str(answer) 68 | copt = can_infer_option(answer, choices) 69 | return copt if copt else can_infer_text(answer, choices) 70 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/result_transfer.py: -------------------------------------------------------------------------------- 1 | from ..evaluate.misc import build_judge 2 | from ..evaluate.multiple_choice import extract_answer_from_item 3 | 4 | from ..smp import * 5 | from .matching_util import can_infer 6 | from .mp_util import track_progress_rich 7 | 8 | 9 | def MMMU_result_transfer(result_path): 10 | res = {} 11 | result_data = load(result_path) 12 | mcq = result_data['A'].notna() 13 | lt = len(result_data) 14 | for i in range(lt): 15 | line = result_data.iloc[i] 16 | if mcq[i]: 17 | options = { 18 | cand: line[cand] 19 | for cand in string.ascii_uppercase 20 | if cand in line and not pd.isna(line[cand]) 21 | } 22 | prediction = line['prediction'] 23 | 
infer_prediction = can_infer(prediction, options) 24 | res[line['id']] = infer_prediction 25 | else: 26 | res[line['id']] = line['prediction'] 27 | result_json = result_path.replace('.xlsx', '.json') 28 | dump(res, result_json) 29 | return result_json 30 | 31 | 32 | def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): 33 | logger = get_logger('Evaluation') 34 | INTERNAL = os.environ.get('INTERNAL', 0) 35 | nproc = judge_kwargs.pop('nproc', 4) 36 | 37 | rd.seed(2680) 38 | suffix = eval_file.split('.')[-1] 39 | model = judge_kwargs['model'] 40 | assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125'] 41 | name_str_map = { 42 | 'chatgpt-0613': 'openai', 43 | 'gpt-4-0125': 'gpt4' 44 | } 45 | name_str = name_str_map[model] if model in name_str_map else model 46 | 47 | if model == 'exact_matching': 48 | model = None 49 | else: 50 | if INTERNAL or gpt_key_set(): 51 | model = build_judge(**judge_kwargs) 52 | else: 53 | logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') 54 | model = None 55 | 56 | logger.info(f'Evaluating {eval_file}') 57 | result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') 58 | result = {} 59 | if osp.exists(result_file): 60 | result = load(result_file) 61 | 62 | data = load(eval_file) 63 | assert 'index' in data, 'Essentail columns missing in the eval_file.' 64 | 65 | data = data.sort_values(by='index') 66 | data['prediction'] = [str(x) for x in data['prediction']] 67 | for k in data.keys(): 68 | data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) 69 | 70 | idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))} 71 | idx2lines = {k: v for k, v in idx2lines.items() if k not in result} 72 | 73 | indices = list(idx2lines.keys()) 74 | lines = [idx2lines[i] for i in indices] 75 | tups = [(model, line) for line in lines] 76 | res = track_progress_rich( 77 | extract_answer_from_item, 78 | tups, 79 | nproc=nproc, 80 | chunksize=nproc, 81 | save=result_file, 82 | keys=indices) 83 | 84 | for i, r in zip(indices, res): 85 | if i in result: 86 | assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log'] 87 | else: 88 | result[i] = r 89 | 90 | indices = list(data['index']) 91 | data['opt'] = [result[i]['opt'] for i in data['index']] 92 | data['log'] = [result[i]['log'] for i in data['index']] 93 | 94 | # load split 95 | output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') 96 | dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) 97 | return output_path 98 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.set_grad_enabled(False) 4 | torch.manual_seed(1234) 5 | from .base import BaseModel 6 | from .cogvlm import CogVlm, GLM4v 7 | from .emu import Emu 8 | from .idefics import IDEFICS, IDEFICS2 9 | from .instructblip import InstructBLIP 10 | from .llava import LLaVA, LLaVA_Next, LLaVA_XTuner 11 | from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V 12 | from .minigpt4 import MiniGPT4 13 | from .mmalaya import MMAlaya 14 | from .monkey import Monkey, MonkeyChat 15 | from .mplug_owl2 import mPLUG_Owl2 16 | from .omnilmm import OmniLMM12B 17 | from .open_flamingo import OpenFlamingo 18 | from .pandagpt import PandaGPT 19 | from .qwen_vl import QwenVL, QwenVLChat 20 | from .transcore_m import TransCoreM 21 | from .visualglm import 
VisualGLM 22 | from .xcomposer import ShareCaptioner, XComposer, XComposer2, XComposer2_4KHD 23 | from .yi_vl import Yi_VL 24 | from .internvl_chat import InternVLChat 25 | from .deepseek_vl import DeepSeekVL 26 | from .mgm import Mini_Gemini 27 | from .bunnyllama3 import BunnyLLama3 28 | from .vxverse import VXVERSE 29 | from .paligemma import PaliGemma 30 | from .qh_360vl import QH_360VL 31 | from .phi3_vision import Phi3Vision 32 | from .wemm import WeMM 33 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/__pycache__/base.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/base.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/__pycache__/bunnyllama3.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/bunnyllama3.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/__pycache__/cogvlm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/cogvlm.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/__pycache__/deepseek_vl.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/deepseek_vl.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/__pycache__/emu.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/emu.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/__pycache__/idefics.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/idefics.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/__pycache__/instructblip.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/__pycache__/instructblip.cpython-39.pyc -------------------------------------------------------------------------------- 
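base.py, which follows, defines the BaseModel contract that every wrapper above implements: set the INSTALL_REQ / INTERLEAVE flags and provide generate_inner(message, dataset); image dumping, message preprocessing and the public generate() entry point are inherited. A toy, purely illustrative subclass sketch (EchoModel is hypothetical and not part of the codebase):

from vlmeval.vlm import BaseModel

class EchoModel(BaseModel):
    # INTERLEAVE = False: message_to_promptimg() collapses the message into
    # (joined text prompt, first image path) for single-image models.
    INSTALL_REQ = False
    INTERLEAVE = False

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message)
        # A real wrapper would run its VLM here; we just echo the inputs.
        return f'[echo] {prompt} (image: {image_path})'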
-------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/base.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | from ..utils.dataset_config import img_root_map 3 | from abc import abstractmethod 4 | 5 | 6 | class BaseModel: 7 | 8 | INTERLEAVE = False 9 | allowed_types = ['text', 'image'] 10 | 11 | def use_custom_prompt(self, dataset): 12 | """Whether to use custom prompt for the given dataset. 13 | 14 | Args: 15 | dataset (str): The name of the dataset. 16 | 17 | Returns: 18 | bool: Whether to use custom prompt. If True, will call `build_prompt` of the VLM to build the prompt. 19 | Default to False. 20 | """ 21 | return False 22 | 23 | @abstractmethod 24 | def build_prompt(self, line, dataset): 25 | """Build custom prompts for a specific dataset. Called only if `use_custom_prompt` returns True. 26 | 27 | Args: 28 | line (line of pd.DataFrame): The raw input line. 29 | dataset (str): The name of the dataset. 30 | 31 | Returns: 32 | str: The built message. 33 | """ 34 | raise NotImplementedError 35 | 36 | def dump_image(self, line, dataset): 37 | """Dump the image(s) of the input line to the corresponding dataset folder. 38 | 39 | Args: 40 | line (line of pd.DataFrame): The raw input line. 41 | dataset (str): The name of the dataset. 42 | 43 | Returns: 44 | str | list[str]: The paths of the dumped images. 45 | """ 46 | ROOT = LMUDataRoot() 47 | assert isinstance(dataset, str) 48 | img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) 49 | os.makedirs(img_root, exist_ok=True) 50 | if 'image' in line: 51 | if isinstance(line['image'], list): 52 | tgt_path = [] 53 | assert 'image_path' in line 54 | for img, im_name in zip(line['image'], line['image_path']): 55 | path = osp.join(img_root, im_name) 56 | if not read_ok(path): 57 | decode_base64_to_image_file(img, path) 58 | tgt_path.append(path) 59 | else: 60 | tgt_path = osp.join(img_root, f"{line['index']}.jpg") 61 | if not read_ok(tgt_path): 62 | decode_base64_to_image_file(line['image'], tgt_path) 63 | tgt_path = [tgt_path] 64 | else: 65 | assert 'image_path' in line 66 | tgt_path = toliststr(line['image_path']) 67 | 68 | return tgt_path 69 | 70 | @abstractmethod 71 | def generate_inner(self, message, dataset=None): 72 | raise NotImplementedError 73 | 74 | def check_content(self, msgs): 75 | """Check the content type of the input. Four types are allowed: str, dict, liststr, listdict. 76 | """ 77 | if isinstance(msgs, str): 78 | return 'str' 79 | if isinstance(msgs, dict): 80 | return 'dict' 81 | if isinstance(msgs, list): 82 | types = [self.check_content(m) for m in msgs] 83 | if all(t == 'str' for t in types): 84 | return 'liststr' 85 | if all(t == 'dict' for t in types): 86 | return 'listdict' 87 | return 'unknown' 88 | 89 | def preproc_content(self, inputs): 90 | """Convert the raw input messages to a list of dicts. 91 | 92 | Args: 93 | inputs: raw input messages. 94 | 95 | Returns: 96 | list(dict): The preprocessed input messages. Will return None if failed to preprocess the input. 
97 | """ 98 | if self.check_content(inputs) == 'str': 99 | return [dict(type='text', value=inputs)] 100 | elif self.check_content(inputs) == 'dict': 101 | assert 'type' in inputs and 'value' in inputs 102 | return [inputs] 103 | elif self.check_content(inputs) == 'liststr': 104 | res = [] 105 | for s in inputs: 106 | mime, pth = parse_file(s) 107 | if mime is None or mime == 'unknown': 108 | res.append(dict(type='text', value=s)) 109 | else: 110 | res.append(dict(type=mime.split('/')[0], value=pth)) 111 | return res 112 | elif self.check_content(inputs) == 'listdict': 113 | for item in inputs: 114 | assert 'type' in item and 'value' in item 115 | mime, s = parse_file(item['value']) 116 | if mime is None: 117 | assert item['type'] == 'text' 118 | else: 119 | assert mime.split('/')[0] == item['type'] 120 | item['value'] = s 121 | return inputs 122 | else: 123 | return None 124 | 125 | def generate(self, message, dataset=None): 126 | """Generate the output message. 127 | 128 | Args: 129 | message (list[dict]): The input message. 130 | dataset (str, optional): The name of the dataset. Defaults to None. 131 | 132 | Returns: 133 | str: The generated message. 134 | """ 135 | assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}' 136 | message = self.preproc_content(message) 137 | assert message is not None and self.check_content(message) == 'listdict' 138 | for item in message: 139 | assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}' 140 | return self.generate_inner(message, dataset) 141 | 142 | def message_to_promptimg(self, message): 143 | assert not self.INTERLEAVE 144 | model_name = self.__class__.__name__ 145 | warnings.warn( 146 | f'Model {model_name} does not support interleaved input. ' 147 | 'Will use the first image and aggregated texts as prompt. ') 148 | num_images = len([x for x in message if x['type'] == 'image']) 149 | if num_images == 0: 150 | prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) 151 | image = None 152 | else: 153 | prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) 154 | image = [x['value'] for x in message if x['type'] == 'image'][0] 155 | return prompt, image 156 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/bunnyllama3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from PIL import Image 5 | import warnings 6 | 7 | from .base import BaseModel 8 | from ..smp import * 9 | from ..utils import DATASET_TYPE 10 | 11 | 12 | class BunnyLLama3(BaseModel): 13 | 14 | INSTALL_REQ = False 15 | INTERLEAVE = False 16 | 17 | def __init__(self, model_path='BAAI/Bunny-Llama-3-8B-V', **kwargs): 18 | assert model_path is not None 19 | transformers.logging.set_verbosity_error() 20 | transformers.logging.disable_progress_bar() 21 | warnings.filterwarnings('ignore') 22 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 23 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True) 24 | self.kwargs = kwargs 25 | 26 | def generate_inner(self, message, dataset=None): 27 | prompt, image_path = self.message_to_promptimg(message) 28 | text = f"A chat between a curious user and an artificial intelligence assistant. 
\ 29 | The assistant gives helpful, detailed, and polite answers to the user's questions. \ 30 | USER: \n{prompt} ASSISTANT:" 31 | text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('')] 32 | input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0) 33 | image = Image.open(image_path).convert('RGB') 34 | image_tensor = self.model.process_images([image], self.model.config).to(dtype=self.model.dtype) 35 | 36 | output_ids = self.model.generate(input_ids, images=image_tensor, max_new_tokens=100, use_cache=True)[0] 37 | response = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True) 38 | return response 39 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/cogvlm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from .base import BaseModel 4 | from ..smp import * 5 | from ..utils import DATASET_TYPE 6 | from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer 7 | 8 | 9 | class GLM4v(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = False 13 | 14 | def __init__(self, model_path='THUDM/glm-4v-9b', **kwargs): 15 | assert model_path is not None 16 | self.model_path = model_path 17 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 18 | self.model = AutoModelForCausalLM.from_pretrained( 19 | model_path, 20 | torch_dtype=torch.bfloat16, 21 | low_cpu_mem_usage=True, 22 | trust_remote_code=True 23 | ).to('cuda').eval() 24 | gen_kwargs = {'max_length': 2048, 'do_sample': False} 25 | gen_kwargs.update(kwargs) 26 | self.kwargs = gen_kwargs 27 | self.end_text_token = '<|endoftext|>' 28 | 29 | def generate_inner(self, message, dataset=None): 30 | prompt, image_path = self.message_to_promptimg(message) 31 | image = Image.open(image_path).convert('RGB') 32 | inputs = self.tokenizer.apply_chat_template( 33 | [{'role': 'user', 'image': image, 'content': prompt}], 34 | add_generation_prompt=True, tokenize=True, return_tensors='pt', return_dict=True 35 | ) 36 | inputs = inputs.to('cuda') 37 | 38 | with torch.no_grad(): 39 | outputs = self.model.generate(**inputs, **self.kwargs) 40 | outputs = outputs[:, inputs['input_ids'].shape[1]:] 41 | response = self.tokenizer.decode(outputs[0]) 42 | return response.split(self.end_text_token)[0] 43 | 44 | 45 | class CogVlm(BaseModel): 46 | 47 | INSTALL_REQ = False 48 | INTERLEAVE = False 49 | 50 | def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=None, **kwargs): 51 | assert model_path is not None 52 | model = AutoModelForCausalLM.from_pretrained( 53 | model_path, 54 | torch_dtype=torch.bfloat16, 55 | trust_remote_code=True, 56 | ).to('cuda').eval() 57 | 58 | self.kwargs = kwargs 59 | if tokenizer_name: 60 | tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name) 61 | gen_kwargs = {'max_length': 2048, 'do_sample': False} 62 | self.end_text_token = '' 63 | else: 64 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 65 | gen_kwargs = {'max_new_tokens': 2048, 'pad_token_id': 128002} 66 | self.end_text_token = '<|end_of_text|>' 67 | self.kwargs.update(gen_kwargs) 68 | self.tokenizer = tokenizer 69 | self.model = model 70 | 71 | def use_custom_prompt(self, dataset): 72 | assert dataset is not None 73 | if DATASET_TYPE(dataset) == 'multi-choice': 74 | return True 75 | return False 76 | 77 | def build_prompt(self, line, dataset=None): 
78 | assert dataset is None or isinstance(dataset, str) 79 | assert self.use_custom_prompt(dataset) 80 | tgt_path = self.dump_image(line, dataset) 81 | 82 | if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': 83 | question = line['question'] 84 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 85 | if hint is not None: 86 | question = hint + '\n' + question 87 | 88 | option_candidate = string.ascii_uppercase 89 | options = { 90 | cand: line[cand] 91 | for cand in option_candidate 92 | if cand in line and not pd.isna(line[cand]) 93 | } 94 | for key, item in options.items(): 95 | question += f'\n{key}. {item}' 96 | prompt = question 97 | 98 | if not cn_string(prompt): 99 | prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly." 100 | else: 101 | prompt = prompt + '\n' + '请直接回答选项字母。' 102 | else: 103 | prompt = line['question'] 104 | message = [dict(type='text', value=prompt)] 105 | message.extend([dict(type='image', value=p) for p in tgt_path]) 106 | 107 | return message 108 | 109 | def generate_inner(self, message, dataset=None): 110 | prompt, image_path = self.message_to_promptimg(message) 111 | 112 | image = Image.open(image_path).convert('RGB') 113 | inputs = self.model.build_conversation_input_ids( 114 | self.tokenizer, query=prompt, history=[], images=[image]) # chat mode 115 | inputs = { 116 | 'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'), 117 | 'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'), 118 | 'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'), 119 | 'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]], 120 | } 121 | 122 | with torch.no_grad(): 123 | outputs = self.model.generate(**inputs, **self.kwargs) 124 | outputs = outputs[:, inputs['input_ids'].shape[1]:] 125 | response = self.tokenizer.decode(outputs[0]) 126 | response = response.split(self.end_text_token)[0].strip() 127 | return response 128 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/deepseek_vl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from transformers import AutoModelForCausalLM 4 | import warnings 5 | from .base import BaseModel 6 | 7 | 8 | class DeepSeekVL(BaseModel): 9 | 10 | INSTALL_REQ = True 11 | INTERLEAVE = True 12 | 13 | def check_install(self): 14 | try: 15 | import deepseek_vl 16 | except ImportError: 17 | warnings.warn( 18 | 'Please first install deepseek_vl from source codes in: https://github.com/deepseek-ai/DeepSeek-VL') 19 | sys.exit(-1) 20 | 21 | def __init__(self, model_path='deepseek-ai/deepseek-vl-1.3b-chat', **kwargs): 22 | self.check_install() 23 | assert model_path is not None 24 | self.model_path = model_path 25 | from deepseek_vl.models import VLChatProcessor 26 | 27 | self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path) 28 | self.tokenizer = self.vl_chat_processor.tokenizer 29 | 30 | model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) 31 | self.model = model.to(torch.bfloat16).cuda().eval() 32 | 33 | torch.cuda.empty_cache() 34 | default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True) 35 | default_kwargs.update(kwargs) 36 | self.kwargs = default_kwargs 37 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 38 | 39 | def prepare_inputs(self, message): 40 | content, images = '', [] 41 | for s in message: 42 | if s['type'] == 'image': 43 | images.append(s['value']) 44 | content += '' 45 | elif s['type'] == 'text': 46 | content += s['value'] 47 | conversation = [ 48 | dict(role='User', content=content, images=images), 49 | dict(role='Assistant', content='') 50 | ] 51 | return conversation 52 | 53 | def generate_inner(self, message, dataset=None): 54 | conversation = self.prepare_inputs(message) 55 | from deepseek_vl.utils.io import load_pil_images 56 | pil_images = load_pil_images(conversation) 57 | prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True) 58 | prepare_inputs = prepare_inputs.to(self.model.device) 59 | inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs) 60 | 61 | outputs = self.model.language_model.generate( 62 | inputs_embeds=inputs_embeds, 63 | attention_mask=prepare_inputs.attention_mask, 64 | pad_token_id=self.tokenizer.eos_token_id, 65 | bos_token_id=self.tokenizer.bos_token_id, 66 | eos_token_id=self.tokenizer.eos_token_id, 67 | **self.kwargs) 68 | answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True) 69 | return answer 70 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/emu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from PIL import Image 4 | import os.path as osp 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class Emu(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = True 13 | 14 | def __init__(self, 15 | model_path='BAAI/Emu2-Chat', 16 | **kwargs): 17 | 18 | self.model_path = model_path 19 | assert osp.exists(model_path) or splitlen(model_path) == 2 20 | 21 | from transformers import AutoModelForCausalLM, AutoTokenizer 22 | from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model 23 | 24 | local_rank = os.environ.get('LOCAL_RANK', 0) 25 | 26 | device_num = torch.cuda.device_count() 27 | assert local_rank * 2 <= device_num, 'The number of devices does not match the world size' 28 | assert device_num >= 2, 'You need at least 2 GPUs to use EMU' 29 | 30 | device_1 = local_rank 31 | device_2 = local_rank + device_num // 2 32 | 33 | torch.cuda.set_device(device_1) 34 | torch.cuda.set_device(device_2) 35 | 36 | tokenizer = AutoTokenizer.from_pretrained(model_path) # "BAAI/Emu2-Chat" 37 | self.tokenizer = tokenizer 38 | with init_empty_weights(): 39 | model = AutoModelForCausalLM.from_pretrained( 40 | model_path, # "BAAI/Emu2-Chat" 41 | torch_dtype=torch.bfloat16, 42 | low_cpu_mem_usage=True, 43 | trust_remote_code=True) 44 | 45 | device_map = infer_auto_device_map( 46 | model, 47 | max_memory={ 48 | device_1: '70GiB', 49 | device_2: '70GiB' 50 | }, 51 | no_split_module_classes=['Block', 'LlamaDecoderLayer']) 52 | 53 | # input and output logits should be on same device 54 | device_map['model.decoder.lm.lm_head'] = device_1 55 | 56 | model = dispatch_model( 57 | model, 58 | device_map=device_map).eval() 59 | 60 | self.model = model 61 | kwargs_default = dict(max_new_tokens=512, length_penalty=-1) 62 | kwargs_default.update(kwargs) 63 | self.kwargs = kwargs_default 64 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 65 | 66 | def generate_inner(self, message, dataset=None): 67 | query, images = '', [] 68 | for item in message: 69 | if item['type'] == 'image': 70 | images.append(Image.open(item['value']).convert('RGB')) 71 | query += '[]' 72 | elif item['type'] == 'text': 73 | query += item['value'] 74 | 75 | inputs = self.model.build_input_ids( 76 | text=[query], 77 | tokenizer=self.tokenizer, 78 | image=images 79 | ) 80 | 81 | with torch.no_grad(): 82 | outputs = self.model.generate( 83 | input_ids=inputs['input_ids'], 84 | attention_mask=inputs['attention_mask'], 85 | image=inputs['image'].to(torch.bfloat16), 86 | **self.kwargs) 87 | 88 | output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) 89 | return output_text[0] 90 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/instructblip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import os.path as osp 4 | import sys 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class InstructBLIP(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, name): 15 | self.config_map = { 16 | 'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml', 17 | 'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml', 18 | } 19 | 20 | self.file_path = __file__ 21 | config_root = osp.dirname(self.file_path) 22 | 23 | try: 24 | from lavis.models import load_preprocess 25 | from omegaconf import OmegaConf 26 | from lavis.common.registry import registry 27 | except: 28 | warnings.warn('Please install lavis before using InstructBLIP. ') 29 | sys.exit(-1) 30 | 31 | assert name in self.config_map 32 | cfg_path = osp.join(config_root, self.config_map[name]) 33 | cfg = OmegaConf.load(cfg_path) 34 | 35 | model_cfg = cfg.model 36 | assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2 37 | model_cls = registry.get_model_class(name='blip2_vicuna_instruct') 38 | model = model_cls.from_config(model_cfg) 39 | model.eval() 40 | 41 | self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu' 42 | device = self.device 43 | model.to(device) 44 | self.model = model 45 | self.kwargs = {'max_length': 512} 46 | 47 | preprocess_cfg = cfg.preprocess 48 | vis_processors, _ = load_preprocess(preprocess_cfg) 49 | self.vis_processors = vis_processors 50 | 51 | def generate_inner(self, message, dataset=None): 52 | prompt, image_path = self.message_to_promptimg(message) 53 | vis_processors = self.vis_processors 54 | raw_image = Image.open(image_path).convert('RGB') 55 | image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device) 56 | outputs = self.model.generate(dict(image=image_tensor, prompt=prompt)) 57 | return outputs[0] 58 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava import LLaVA, LLaVA_Next 2 | from .llava_xtuner import LLaVA_XTuner 3 | 4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner'] 5 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/llava/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/llava/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava_xtuner.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava_xtuner.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/minigpt4.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os.path as osp 4 | import warnings 5 | from transformers import StoppingCriteriaList 6 | from .base import BaseModel 7 | 8 | 9 | class MiniGPT4(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, 15 | mode='v2', 16 | root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/', 17 | temperature=1, 18 | max_out_len=512): 19 | 20 | if root is None: 21 | warnings.warn( 22 | 'Please set root to the directory of MiniGPT-4, which is cloned from here: ' 23 | 'https://github.com/Vision-CAIR/MiniGPT-4. ' 24 | ) 25 | 26 | if mode == 'v2': 27 | cfg = 'minigptv2_eval.yaml' 28 | elif mode == 'v1_7b': 29 | cfg = 'minigpt4_7b_eval.yaml' 30 | elif mode == 'v1_13b': 31 | cfg = 'minigpt4_13b_eval.yaml' 32 | else: 33 | raise NotImplementedError 34 | 35 | self.mode = mode 36 | self.temperature = temperature 37 | self.max_out_len = max_out_len 38 | self.root = root 39 | this_dir = osp.dirname(__file__) 40 | 41 | self.cfg = osp.join(this_dir, 'misc', cfg) 42 | sys.path.append(self.root) 43 | 44 | from omegaconf import OmegaConf 45 | from minigpt4.common.registry import registry 46 | from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2 47 | 48 | device = torch.cuda.current_device() 49 | self.device = device 50 | 51 | cfg_path = self.cfg 52 | cfg = OmegaConf.load(cfg_path) 53 | 54 | model_cfg = cfg.model 55 | model_cfg.device_8bit = device 56 | model_cls = registry.get_model_class(model_cfg.arch) 57 | model = model_cls.from_config(model_cfg) 58 | model = model.to(device) 59 | model.eval() 60 | vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train 61 | vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) 62 | self.model = model 63 | self.vis_processor = vis_processor 64 | 65 | self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0 66 | stop_words_ids = [[835], [2277, 29937]] 67 | stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids] 68 | self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) 69 | 70 | def generate_inner(self, message, dataset=None): 71 | from minigpt4.conversation.conversation import Chat 72 | prompt, image_path = self.message_to_promptimg(message) 73 | if self.mode == 'v2': 74 | chat = Chat(self.model, 
self.vis_processor, device=self.device) 75 | else: 76 | chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria) 77 | 78 | chat_state = self.CONV_VISION.copy() 79 | img_list = [] 80 | _ = chat.upload_img(image_path, chat_state, img_list) 81 | chat.encode_img(img_list) 82 | chat.ask(prompt, chat_state) 83 | with torch.inference_mode(): 84 | msg = chat.answer(conv=chat_state, img_list=img_list)[0] 85 | return msg 86 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/misc/minigpt4_13b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-13b-v0" 25 | 26 | datasets: 27 | cc_sbu_align: 28 | vis_processor: 29 | train: 30 | name: "blip2_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | 36 | run: 37 | task: image_text_pretrain 38 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-7b-v0" 25 | 26 | 27 | datasets: 28 | cc_sbu_align: 29 | vis_processor: 30 | train: 31 | name: "blip2_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | 37 | run: 38 | task: image_text_pretrain 39 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/misc/minigptv2_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt_v2 3 | model_type: pretrain 4 | max_txt_len: 160 5 | end_sym: "" 6 | low_resource: True 7 | prompt_template: '[INST] {} [/INST]' 8 | ckpt: "please set this value to the path of pretrained 
checkpoint" 9 | lora_r: 64 10 | lora_alpha: 16 11 | 12 | # vit encoder 13 | image_size: 448 14 | drop_path_rate: 0 15 | use_grad_checkpoint: False 16 | vit_precision: "fp16" 17 | freeze_vit: True 18 | 19 | # generation configs 20 | prompt: "" 21 | 22 | # LLM 23 | llama_model: "please set this value to the path of llama2-chat-7b" 24 | 25 | datasets: 26 | cc_sbu_align: 27 | vis_processor: 28 | train: 29 | name: "blip2_image_eval" 30 | image_size: 448 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | 35 | run: 36 | task: image_text_pretrain 37 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/mmalaya.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | from PIL import Image 5 | from .base import BaseModel 6 | 7 | 8 | class MMAlaya(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='DataCanvas/MMAlaya', **kwargs): 14 | assert model_path is not None 15 | self.model_path = model_path 16 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 17 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True).eval() 18 | # need initialize tokenizer 19 | model.initialize_tokenizer(self.tokenizer) 20 | self.model = model.cuda() 21 | 22 | self.kwargs = kwargs 23 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 24 | torch.cuda.empty_cache() 25 | 26 | def generate_inner(self, message, dataset=None): 27 | # read image 28 | prompt, image_path = self.message_to_promptimg(message) 29 | image = Image.open(image_path).convert('RGB') 30 | # tokenize prompt, and proprecess image 31 | input_ids, image_tensor, stopping_criteria = self.model.prepare_for_inference( 32 | prompt, 33 | self.tokenizer, 34 | image, 35 | return_tensors='pt') 36 | with torch.inference_mode(): 37 | output_ids = self.model.generate( 38 | inputs=input_ids.cuda(), 39 | images=image_tensor.cuda(), 40 | do_sample=False, 41 | max_new_tokens=512, 42 | num_beams=1, 43 | use_cache=True, 44 | stopping_criteria=[stopping_criteria], 45 | ) 46 | # truncate input_ids in generate_ids and then decode to text 47 | input_token_len = input_ids.shape[1] 48 | response = self.tokenizer.batch_decode( 49 | output_ids[:, input_token_len:].cpu(), 50 | skip_special_tokens=True, 51 | clean_up_tokenization_spaces=False 52 | )[0].strip() 53 | return response 54 | 55 | 56 | if __name__ == '__main__': 57 | model = MMAlaya() 58 | response = model.generate(['./assets/apple.jpg', '请详细描述一下这张图片。']) 59 | print(response) 60 | 61 | """ 62 | export PYTHONPATH=$PYTHONPATH:/tmp/VLMEvalKit 63 | CUDA_VISIBLE_DEVICES=0 python vlmeval/vlm/mmalaya.py 64 | """ 65 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/mplug_owl2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from PIL import Image 4 | from .base import BaseModel 5 | from ..smp import * 6 | from ..utils import DATASET_TYPE 7 | 8 | 9 | class mPLUG_Owl2(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs): 15 | try: 16 | from mplug_owl2.model.builder import load_pretrained_model 17 | from mplug_owl2.mm_utils import get_model_name_from_path 
18 | except: 19 | warnings.warn('Please install mPLUG_Owl2 before using mPLUG_Owl2. ') 20 | sys.exit(-1) 21 | 22 | model_name = get_model_name_from_path(model_path) 23 | tokenizer, model, image_processor, context_len = load_pretrained_model( 24 | model_path, None, model_name, load_8bit=False, load_4bit=False, device='cpu') 25 | 26 | self.model = model.cuda() 27 | self.device = self.model.device 28 | self.image_processor = image_processor 29 | tokenizer.padding_side = 'left' 30 | tokenizer.pad_token_id = tokenizer.eos_token_id 31 | self.tokenizer = tokenizer 32 | self.context_len = context_len 33 | 34 | kwargs_default = dict( 35 | max_new_tokens=512, do_sample=False, num_beams=1, 36 | min_new_tokens=1, length_penalty=1, num_return_sequences=1) 37 | kwargs_default.update(kwargs) 38 | self.kwargs = kwargs_default 39 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 40 | 41 | def use_custom_prompt(self, dataset): 42 | assert dataset is not None 43 | if listinstr(['MMMU'], dataset): 44 | return False 45 | if DATASET_TYPE(dataset) == 'multi-choice' or dataset == 'MMVet': 46 | return True 47 | return False 48 | 49 | def build_prompt(self, line, dataset=None): 50 | assert dataset is None or isinstance(dataset, str) 51 | assert self.use_custom_prompt(dataset) 52 | tgt_path = self.dump_image(line, dataset) 53 | question = line['question'] 54 | if dataset == 'MMVet': 55 | prompt = question + '\nAnswer the question directly. ' 56 | elif DATASET_TYPE(dataset) == 'multi-choice': 57 | options = { 58 | cand: line[cand] 59 | for cand in string.ascii_uppercase 60 | if cand in line and not pd.isna(line[cand]) 61 | } 62 | options_prompt = '' 63 | for key, item in options.items(): 64 | options_prompt += f'{key}. {item}\n' 65 | 66 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 67 | prompt = f'Hint: {hint}\n' if hint is not None else '' 68 | prompt += f'{question}\n' 69 | prompt += ( 70 | f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. ' 71 | if len(options) else 'Answer the question directly. 
' 72 | ) 73 | else: 74 | raise NotImplementedError 75 | 76 | message = [dict(type='text', value=prompt)] 77 | message.extend([dict(type='image', value=s) for s in tgt_path]) 78 | return message 79 | 80 | def generate_inner(self, message, dataset=None): 81 | from mplug_owl2.constants import IMAGE_TOKEN_INDEX 82 | from mplug_owl2.mm_utils import process_images, tokenizer_image_token 83 | kwargs = cp.deepcopy(self.kwargs) 84 | if dataset in ['MMVet', 'LLaVABench']: 85 | kwargs['length_penalty'] = 0 86 | elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': 87 | kwargs['length_penalty'] = 0 88 | elif dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': 89 | kwargs['max_new_tokens'] = 10 90 | num_images = len([x for x in message if x['type'] == 'image']) 91 | assert num_images >= 0 92 | prompt_full = 'USER: ' 93 | images = [] 94 | if num_images == 1: 95 | prompt, image = self.message_to_promptimg(message) 96 | prompt_full += f'<|image|>{prompt} \nASSISTANT: ' 97 | images.append(image) 98 | else: 99 | for msg in message: 100 | if msg['type'] == 'image': 101 | images.append(msg['value']) 102 | prompt_full += '<|image|>' 103 | elif msg['type'] == 'text': 104 | prompt_full += msg['value'] 105 | prompt_full += '\nASSISTANT: ' 106 | 107 | def preproc_image(fname): 108 | image = Image.open(fname).convert('RGB') 109 | max_edge = max(image.size) 110 | image = image.resize((max_edge, max_edge)) 111 | return image 112 | images = [preproc_image(fname) for fname in images] 113 | image_tensor = process_images(images, self.image_processor) 114 | image_tensor = image_tensor.to(self.device, dtype=torch.float16) 115 | input_ids = tokenizer_image_token( 116 | prompt_full, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device) 117 | 118 | with torch.inference_mode(): 119 | output_ids = self.model.generate( 120 | input_ids=input_ids, 121 | images=image_tensor, 122 | output_hidden_states=True, 123 | use_cache=True, 124 | **kwargs) 125 | answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() 126 | return answer.split('')[0] 127 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/omnilmm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from PIL import Image 4 | from transformers import AutoTokenizer 5 | 6 | from .base import BaseModel 7 | from ..smp import * 8 | from ..utils import DATASET_TYPE 9 | 10 | 11 | DEFAULT_IMAGE_TOKEN = '' 12 | DEFAULT_IMAGE_PATCH_TOKEN = '' 13 | DEFAULT_IM_START_TOKEN = '' 14 | DEFAULT_IM_END_TOKEN = '' 15 | 16 | 17 | def init_omni_lmm(model_path): 18 | from omnilmm.model.omnilmm import OmniLMMForCausalLM 19 | from omnilmm.utils import disable_torch_init 20 | from omnilmm.model.utils import build_transform 21 | 22 | torch.backends.cuda.matmul.allow_tf32 = True 23 | disable_torch_init() 24 | tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=2048) 25 | 26 | model = OmniLMMForCausalLM.from_pretrained(model_path, tune_clip=True, torch_dtype=torch.bfloat16, device_map='cpu') 27 | model = model.to(device='cuda', dtype=torch.bfloat16) 28 | 29 | image_processor = build_transform(is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP') 30 | 31 | mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False) 32 | assert mm_use_im_start_end 33 | 34 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], 
special_tokens=True) 35 | 36 | vision_config = model.model.vision_config 37 | vision_config.im_patch_token = tokenizer.convert_tokens_to_ids( 38 | [DEFAULT_IMAGE_PATCH_TOKEN])[0] 39 | vision_config.use_im_start_end = mm_use_im_start_end 40 | vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids( 41 | [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN]) 42 | image_token_len = model.model.config.num_query 43 | 44 | return model, image_processor, image_token_len, tokenizer 45 | 46 | 47 | def expand_question_into_multimodal(question_text, image_token_len, im_st_token, im_ed_token, im_patch_token): 48 | if '' in question_text[0]['content']: 49 | question_text[0]['content'] = question_text[0]['content'].replace( 50 | '', im_st_token + im_patch_token * image_token_len + im_ed_token) 51 | else: 52 | question_text[0]['content'] = im_st_token + im_patch_token * \ 53 | image_token_len + im_ed_token + '\n' + question_text[0]['content'] 54 | return question_text 55 | 56 | 57 | def wrap_question_for_omni_lmm(question, image_token_len, tokenizer): 58 | from omnilmm.train.train_utils import omni_preprocess 59 | question = expand_question_into_multimodal( 60 | question, image_token_len, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN) 61 | 62 | conversation = question 63 | data_dict = omni_preprocess(sources=[conversation], tokenizer=tokenizer, generation=True) 64 | 65 | data_dict = dict(input_ids=data_dict['input_ids'][0], labels=data_dict['labels'][0]) 66 | return data_dict 67 | 68 | 69 | class OmniLMM12B(BaseModel): 70 | 71 | INSTALL_REQ = True 72 | INTERLEAVE = False 73 | 74 | def __init__(self, model_path, root, **kwargs) -> None: 75 | sys.path.append(root) 76 | model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path) 77 | self.model = model 78 | self.image_token_len = image_token_len 79 | self.image_transform = img_processor 80 | self.tokenizer = tokenizer 81 | self.model.eval() 82 | default_kwargs = dict( 83 | max_new_tokens=512, 84 | do_sample=False, 85 | output_scores=True, 86 | return_dict_in_generate=True, 87 | repetition_penalty=1.1) 88 | default_kwargs.update(kwargs) 89 | self.kwargs = default_kwargs 90 | torch.cuda.empty_cache() 91 | 92 | def generate_inner(self, message, dataset=None): 93 | prompt, image_path = self.message_to_promptimg(message) 94 | try: 95 | image = Image.open(image_path).convert('RGB') 96 | except: 97 | logger = get_logger('OmniLMM Inference') 98 | logger.error('Image Decode Error') 99 | return 'Image Decode Error' 100 | 101 | msgs = [dict(role='user', content=prompt)] 102 | input_ids = wrap_question_for_omni_lmm( 103 | msgs, self.image_token_len, self.tokenizer)['input_ids'] 104 | input_ids = torch.as_tensor(input_ids) 105 | image = self.image_transform(image) 106 | 107 | with torch.inference_mode(): 108 | output = self.model.generate_vllm( 109 | input_ids=input_ids.unsqueeze(0).cuda(), 110 | images=image.unsqueeze(0).half().cuda(), 111 | **self.kwargs) 112 | 113 | response = self.tokenizer.decode( 114 | output.sequences[0], skip_special_tokens=True) 115 | response = response.strip() 116 | return response 117 | 118 | def use_custom_prompt(self, dataset): 119 | assert dataset is not None 120 | if DATASET_TYPE(dataset) == 'multi-choice': 121 | return True 122 | return False 123 | 124 | def build_prompt(self, line, dataset=None): 125 | assert dataset is None or isinstance(dataset, str) 126 | assert self.use_custom_prompt(dataset) 127 | tgt_path = self.dump_image(line, dataset) 128 | 129 | question = 
line['question'] 130 | options = { 131 | cand: line[cand] 132 | for cand in string.ascii_uppercase 133 | if cand in line and not pd.isna(line[cand]) 134 | } 135 | options_prompt = 'Options:\n' 136 | for key, item in options.items(): 137 | options_prompt += f'{key}. {item}\n' 138 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 139 | prompt = '' 140 | if hint is not None: 141 | prompt += f'Hint: {hint}\n' 142 | prompt += f'{question}\n' 143 | if len(options): 144 | prompt += options_prompt 145 | prompt = """ 146 | Study the image carefully and pick the option associated with the correct answer. 147 | Focus solely on selecting the option and avoid including any other content.\n 148 | """ + prompt 149 | 150 | message = [dict(type='text', value=prompt)] 151 | message.extend([dict(type='image', value=s) for s in tgt_path]) 152 | return message 153 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/open_flamingo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from PIL import Image 4 | import os.path as osp 5 | import warnings 6 | from .base import BaseModel 7 | from ..smp import splitlen, get_cache_path 8 | from huggingface_hub import snapshot_download 9 | 10 | 11 | class OpenFlamingo(BaseModel): 12 | 13 | INSTALL_REQ = True 14 | INTERLEAVE = True 15 | 16 | def __init__(self, 17 | name, 18 | mpt_pth=None, 19 | ckpt_pth=None, 20 | **kwargs): 21 | 22 | if mpt_pth is None: 23 | warnings.warn( 24 | 'Please set `mpt_pth` to the directory of MPT-7B, which is cloned from here: ' 25 | 'https://huggingface.co/mosaicml/mpt-7b. ' 26 | ) 27 | sys.exit(-1) 28 | if ckpt_pth is None: 29 | warnings.warn( 30 | 'Please set `ckpt_pth` to the openflamingo ckpt, which is the `checkpoint.pt` file downloaded ' 31 | 'from: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b/tree/main. ' 32 | ) 33 | sys.exit(-1) 34 | else: 35 | if osp.exists(ckpt_pth): 36 | if ckpt_pth.endswith('checkpoint.pt'): 37 | pass 38 | elif osp.isdir(ckpt_pth): 39 | ckpt_pth = osp.join(ckpt_pth, 'checkpoint.pt') 40 | if not osp.exists(ckpt_pth): 41 | sys.exit(-1) 42 | elif splitlen(ckpt_pth, '/') == 2: 43 | cache_path = get_cache_path(ckpt_pth) 44 | if cache_path is None: 45 | snapshot_download(ckpt_pth) 46 | cache_path = get_cache_path(ckpt_pth) 47 | if cache_path is None: 48 | sys.exit(-1) 49 | else: 50 | ckpt_pth = osp.join(cache_path, 'checkpoint.pt') 51 | 52 | self.name = name 53 | assert name in ['v2'] 54 | self.mpt_pth = mpt_pth 55 | try: 56 | from open_flamingo import create_model_and_transforms 57 | except: 58 | raise ImportError('Please first install open_flamingo to use OpenFlamingo') 59 | model, image_processor, tokenizer = create_model_and_transforms( 60 | clip_vision_encoder_path='ViT-L-14', 61 | clip_vision_encoder_pretrained='openai', 62 | lang_encoder_path=mpt_pth, 63 | tokenizer_path=mpt_pth, 64 | cross_attn_every_n_layers=4) 65 | ckpt = torch.load(ckpt_pth) 66 | model.load_state_dict(ckpt, strict=False) 67 | torch.cuda.empty_cache() 68 | self.model = model.eval().cuda() 69 | self.tokenizer = tokenizer 70 | self.tokenizer.padding_side = 'left' 71 | self.image_proc = image_processor 72 | 73 | kwargs_default = dict(max_new_tokens=512, num_beams=3) 74 | kwargs_default.update(kwargs) 75 | self.kwargs = kwargs_default 76 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 77 | 78 | def generate_inner(self, message, dataset=None): 79 | vision_x = [] 80 | prompt = '' 81 | for msg in message: 82 | if msg['type'] == 'image': 83 | img = Image.open(msg['value']) 84 | vision_x.append(self.image_proc(img).unsqueeze(0)) 85 | prompt += '' 86 | elif msg['type'] == 'text': 87 | prompt += msg['value'] 88 | prompt += 'Answer: ' 89 | vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0] 90 | vision_x = vision_x.unsqueeze(1).unsqueeze(0) 91 | lang_x = self.tokenizer([prompt], return_tensors='pt') 92 | generated_text = self.model.generate( 93 | vision_x=vision_x.cuda(), 94 | lang_x=lang_x['input_ids'].cuda(), 95 | attention_mask=lang_x['attention_mask'].cuda(), 96 | **self.kwargs) 97 | generated_text = self.tokenizer.decode(generated_text[0]) 98 | text = generated_text[len(prompt):].split('<|endofchunk|>')[0] 99 | return text 100 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/paligemma.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class PaliGemma(BaseModel): 9 | INSTALL_REQ = False 10 | INTERLEAVE = False 11 | 12 | def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs): 13 | try: 14 | from transformers import AutoProcessor, PaliGemmaForConditionalGeneration 15 | except: 16 | warnings.warn('Please install the latest version transformers.') 17 | sys.exit(-1) 18 | model = PaliGemmaForConditionalGeneration.from_pretrained( 19 | model_path, 20 | torch_dtype=torch.bfloat16, 21 | device_map='cpu', 22 | revision='bfloat16', 23 | ).eval() 24 | self.model = model.cuda() 25 | self.processor = AutoProcessor.from_pretrained(model_path) 26 | self.kwargs = kwargs 27 | 28 | def generate_inner(self, message, dataset=None): 29 | prompt, image_path = self.message_to_promptimg(message) 30 | image = Image.open(image_path).convert('RGB') 31 | 32 | model_inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda') 33 | input_len = model_inputs['input_ids'].shape[-1] 34 | 35 | with torch.inference_mode(): 36 | generation = self.model.generate(**model_inputs, max_new_tokens=512, do_sample=False) 37 | generation = generation[0][input_len:] 38 | res = self.processor.decode(generation, skip_special_tokens=True) 39 | return res 40 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/pandagpt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import os.path as osp 4 | import warnings 5 | from .base import BaseModel 6 | 7 | 8 | class PandaGPT(BaseModel): 9 | 10 | INSTALL_REQ = True 11 | INTERLEAVE = False 12 | 13 | def __init__(self, name, root=None, **kwargs): 14 | if root is None: 15 | warnings.warn('Please set `root` to PandaGPT code directory, which is cloned from here: ') 16 | sys.exit(-1) 17 | 18 | assert name == 'PandaGPT_13B' 19 | self.name = name 20 | sys.path.append(osp.join(root, 'code')) 21 | try: 22 | from model.openllama import OpenLLAMAPEFTModel 23 | except: 24 | raise ImportError( 25 | 'Please first install PandaGPT and set the root path to use PandaGPT, ' 26 | 'which is cloned from here: https://github.com/yxuansu/PandaGPT. 
' 27 | ) 28 | self.args = { 29 | 'model': 'openllama_peft', 30 | 'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'), 31 | 'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'), 32 | 'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'), 33 | 'stage': 2, 34 | 'max_tgt_len': 512, 35 | 'lora_r': 32, 36 | 'lora_alpha': 32, 37 | 'lora_dropout': 0.1, 38 | } 39 | model = OpenLLAMAPEFTModel(**self.args) 40 | delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) 41 | model.load_state_dict(delta_ckpt, strict=False) 42 | torch.cuda.empty_cache() 43 | self.model = model.eval().half().cuda() 44 | kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} 45 | kwargs_default.update(kwargs) 46 | self.kwargs = kwargs_default 47 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 48 | 49 | def generate_inner(self, message, dataset=None): 50 | prompt, image_path = self.message_to_promptimg(message) 51 | struct = { 52 | 'prompt': prompt, 53 | 'image_paths': [image_path], 54 | 'audio_paths': [], 55 | 'video_paths': [], 56 | 'thermal_paths': [], 57 | 'modality_embeds': [] 58 | } 59 | struct.update(self.kwargs) 60 | resp = self.model.generate(struct) 61 | return resp 62 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/phi3_vision.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class Phi3Vision(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs): 14 | try: 15 | from transformers import AutoProcessor, AutoModelForCausalLM 16 | except: 17 | warnings.warn('Please install the latest version transformers.') 18 | sys.exit(-1) 19 | model = AutoModelForCausalLM.from_pretrained( 20 | model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto').eval() 21 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 22 | self.model = model 23 | self.processor = processor 24 | self.kwargs = kwargs 25 | 26 | def generate_inner(self, message, dataset=None): 27 | prompt, image_path = self.message_to_promptimg(message) 28 | image = Image.open(image_path).convert('RGB') 29 | messages = [ 30 | {'role': 'user', 'content': f'<|image_1|>\n{prompt}'} 31 | ] 32 | prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 33 | inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda') 34 | 35 | generation_args = { 36 | 'max_new_tokens': 500, 37 | 'temperature': 0.0, 38 | 'do_sample': False, 39 | } 40 | generation_args.update(self.kwargs) 41 | 42 | generate_ids = self.model.generate( 43 | **inputs, 44 | eos_token_id=self.processor.tokenizer.eos_token_id, 45 | **generation_args 46 | ) 47 | generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] 48 | response = self.processor.batch_decode( 49 | generate_ids, 50 | skip_special_tokens=True, 51 | clean_up_tokenization_spaces=False 52 | )[0] 53 | return response 54 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/qh_360vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers 
import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import os.path as osp 5 | from PIL import Image 6 | from .base import BaseModel 7 | from ..smp import * 8 | from ..utils import DATASET_TYPE 9 | 10 | 11 | class QH_360VL(BaseModel): 12 | 13 | INSTALL_REQ = False 14 | INTERLEAVE = False 15 | 16 | def __init__(self, model_path='qihoo360/360VL-70B', **kwargs): 17 | assert model_path is not None 18 | self.model_path = model_path 19 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 20 | self.model = AutoModelForCausalLM.from_pretrained(model_path, 21 | torch_dtype=torch.float16, 22 | low_cpu_mem_usage=True, 23 | device_map='auto', 24 | trust_remote_code=True).eval() 25 | vision_tower = self.model.get_vision_tower() 26 | vision_tower.load_model() 27 | vision_tower.to(device='cuda', dtype=torch.float16) 28 | self.image_processor = vision_tower.image_processor 29 | self.tokenizer.pad_token = self.tokenizer.eos_token 30 | self.kwargs = kwargs 31 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 32 | torch.cuda.empty_cache() 33 | 34 | def generate(self, message, dataset=None): 35 | 36 | prompt, image_path = self.message_to_promptimg(message) 37 | print(prompt) 38 | image = Image.open(image_path).convert('RGB') 39 | terminators = [ 40 | self.tokenizer.convert_tokens_to_ids('<|eot_id|>',) 41 | ] 42 | inputs = self.model.build_conversation_input_ids(self.tokenizer, 43 | query=prompt, 44 | image=image, 45 | image_processor=self.image_processor) 46 | input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True) 47 | images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True) 48 | 49 | output_ids = self.model.generate(input_ids=input_ids, 50 | images=images, 51 | do_sample=False, 52 | num_beams=1, 53 | max_new_tokens=512, 54 | eos_token_id=terminators, 55 | use_cache=True) 56 | 57 | input_token_len = input_ids.shape[1] 58 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 59 | response = outputs.strip() 60 | 61 | return response 62 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/qwen_vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import copy as cp 5 | from .base import BaseModel 6 | from ..smp import isimg, listinstr 7 | from ..utils import DATASET_TYPE 8 | 9 | 10 | class QwenVL(BaseModel): 11 | 12 | INSTALL_REQ = False 13 | INTERLEAVE = True 14 | 15 | def __init__(self, model_path='Qwen/Qwen-VL', **kwargs): 16 | assert model_path is not None 17 | self.model_path = model_path 18 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 19 | tokenizer.padding_side = 'left' 20 | tokenizer.pad_token_id = tokenizer.eod_id 21 | self.tokenizer = tokenizer 22 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval() 23 | default_kwargs = dict( 24 | do_sample=False, 25 | num_beams=1, 26 | max_new_tokens=512, 27 | min_new_tokens=1, 28 | num_return_sequences=1, 29 | use_cache=True, 30 | output_hidden_states=True, 31 | pad_token_id=tokenizer.eod_id, 32 | eos_token_id=tokenizer.eod_id) 33 | default_kwargs.update(kwargs) 34 | self.kwargs = default_kwargs 35 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 36 | torch.cuda.empty_cache() 37 | 38 | def adjust_kwargs(self, dataset): 39 | kwargs = cp.deepcopy(self.kwargs) 40 | if DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: 41 | kwargs['max_new_tokens'] = 32 42 | elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset: 43 | kwargs['max_new_tokens'] = 32 44 | elif DATASET_TYPE(dataset) == 'VQA': 45 | if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset): 46 | kwargs['max_new_tokens'] = 100 47 | elif listinstr(['TextVQA'], dataset): 48 | kwargs['max_new_tokens'] = 10 49 | return kwargs 50 | 51 | def generate_inner(self, message, dataset=None): 52 | if dataset is not None: 53 | kwargs = self.adjust_kwargs(dataset) 54 | else: 55 | kwargs = self.kwargs 56 | prompt = '' 57 | for s in message: 58 | if s['type'] == 'image': 59 | prompt += f'{s["value"]}' 60 | elif s['type'] == 'text': 61 | prompt += s['value'] 62 | if dataset is not None and DATASET_TYPE(dataset) == 'VQA': 63 | prompt += ' Answer:' 64 | encoded = self.tokenizer([prompt], return_tensors='pt', padding='longest') 65 | input_ids = encoded.input_ids.to('cuda') 66 | attention_mask = encoded.attention_mask.to('cuda') 67 | 68 | pred = self.model.generate( 69 | input_ids=input_ids, 70 | attention_mask=attention_mask, 71 | **kwargs) 72 | answer = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip() 73 | return answer 74 | 75 | 76 | class QwenVLChat(BaseModel): 77 | 78 | INSTALL_REQ = False 79 | INTERLEAVE = True 80 | 81 | def __init__(self, model_path='Qwen/Qwen-VL-Chat', **kwargs): 82 | assert model_path is not None 83 | self.model_path = model_path 84 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 85 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval() 86 | torch.cuda.empty_cache() 87 | self.kwargs = kwargs 88 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 89 | 90 | def generate_inner(self, message, dataset=None): 91 | vl_list = [{'image': s['value']} if s['type'] == 'image' else {'text': s['value']} for s in message] 92 | query = self.tokenizer.from_list_format(vl_list) 93 | response, _ = self.model.chat(self.tokenizer, query=query, history=None, **self.kwargs) 94 | return response 95 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/visualglm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .base import BaseModel 3 | from ..smp import * 4 | 5 | 6 | class VisualGLM(BaseModel): 7 | 8 | INSTALL_REQ = False 9 | INTERLEAVE = False 10 | 11 | def __init__(self, model_path='THUDM/visualglm-6b', **kwargs): 12 | try: 13 | import sat 14 | except: 15 | warnings.warn('Please install SwissArmyTransformer to use VisualGLM') 16 | assert model_path is not None 17 | self.model_path = model_path 18 | 19 | from transformers import AutoModel 20 | from transformers import AutoTokenizer 21 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 22 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda() 23 | self.model = model 24 | self.kwargs = kwargs 25 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 26 | 27 | def generate_inner(self, message, dataset=None): 28 | prompt, image_path = self.message_to_promptimg(message) 29 | output, _ = self.model.chat( 30 | image_path=image_path, 31 | tokenizer=self.tokenizer, 32 | query=prompt, 33 | history=[], 34 | **self.kwargs 35 | ) 36 | return output 37 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/vxverse.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os.path as osp 4 | import warnings 5 | from .base import BaseModel 6 | from transformers import StoppingCriteriaList 7 | from omegaconf import OmegaConf 8 | from PIL import Image 9 | from huggingface_hub import snapshot_download 10 | from vlmeval.smp import * 11 | 12 | model_cfgs = { 13 | 'XVERSE-V-13B': { 14 | 'arch': 'vxverse', 15 | 'model_type': 'pretrain_xverse13b-chat', 16 | 'max_txt_len': 512, 17 | 'end_sym': '<|endoftext|>', 18 | 'low_resource': False, 19 | 'prompt_template': 'Human: {}\nAssistant: ', 20 | 'ckpt': 'xverse/XVERSE-V-13B', 21 | 'lora_r': 128, 22 | 'lora_alpha': 256, 23 | 'lora_dropout': 0.05, 24 | 'lora_target_modules': 'all_linear', 25 | 'has_qformer': False, 26 | 'n_proj_layers': 2, 27 | 'vit_model': 'openai/clip-vit-large-patch14', 28 | 'vit_path': 'openai/clip-vit-large-patch14', 29 | 'image_size': 224, 30 | 'drop_path_rate': 0, 31 | 'vit_precision': 'fp16', 32 | 'llama_model': 'xverse/XVERSE-13B-Chat', 33 | } 34 | } 35 | 36 | 37 | class VXVERSE(BaseModel): 38 | 39 | INSTALL_REQ = True 40 | INTERLEAVE = False 41 | 42 | def __init__(self, model_name='XVERSE-V-13B', root=None, **kwargs): 43 | 44 | if root is None: 45 | warnings.warn('Please set root to the directory of vxverse.') 46 | 47 | if model_name == 'XVERSE-V-13B': 48 | cfg = model_cfgs['XVERSE-V-13B'] 49 | else: 50 | raise NotImplementedError 51 | 52 | ckpt_dir = cfg['ckpt'] 53 | if not osp.isdir(ckpt_dir): 54 | cache_path = get_cache_path(ckpt_dir) 55 | if cache_path is not None: 56 | ckpt_dir = cache_path 57 | else: 58 | ckpt_dir = snapshot_download(repo_id=ckpt_dir) 59 | assert osp.exists(ckpt_dir) and osp.isdir(ckpt_dir) 60 | ckpt = osp.join(ckpt_dir, 'adapter_and_lora.bin') 61 | cfg['ckpt'] = ckpt 62 | model_cfg = OmegaConf.create(cfg) 63 | 64 | self.model_name = model_name 65 | 66 | self.root = root 67 | sys.path.append(self.root) 68 | 69 | from vxverse.common.registry import registry 70 | from vxverse.conversation.conversation import CONV_VISION_XVERSE 71 | 72 | device = torch.cuda.current_device() 73 | self.device = device 74 | 75 | model_cls = registry.get_model_class(model_cfg.arch) 76 | model = model_cls.from_config(model_cfg) 77 | model = model.to(device) 78 | model.eval() 79 | vis_processor_cfg = OmegaConf.create(dict(name='hd_image_train', image_size=224)) 80 | vis_processor = registry.get_processor_class( 81 | vis_processor_cfg.name 82 | ).from_config(vis_processor_cfg) 83 | 84 | self.model = model 85 | self.vis_processor = vis_processor 86 | self.vis_processor_cfg = vis_processor_cfg 87 | 88 | self.CONV_VISION = CONV_VISION_XVERSE 89 | self.CONV_VISION.system = '' 90 | stop_words_ids = [[835], [2277, 29937]] 91 | self.stop_words_ids = stop_words_ids 92 | default_kwargs = dict(max_new_tokens=512) 93 | default_kwargs.update(kwargs) 94 | self.kwargs = default_kwargs 95 | 96 | def generate_inner(self, message, dataset=None): 97 | prompt, image_path = self.message_to_promptimg(message) 98 | 99 | image = Image.open(image_path).convert('RGB') 100 | image = 
self.vis_processor(image) 101 | 102 | if self.vis_processor_cfg.name == 'hd_image_train': 103 | patches_per_image = [[image.shape[0]]] 104 | image = [image] 105 | else: 106 | patches_per_image = None 107 | image = image.unsqueeze(0) 108 | 109 | chat_state = self.CONV_VISION.copy() 110 | texts = self.prepare_texts([prompt], chat_state) 111 | texts = [text.lstrip() for text in texts] 112 | answers = self.model.generate( 113 | image, 114 | texts, 115 | patches_per_images=patches_per_image, 116 | do_sample=False, 117 | stop_words_ids=self.stop_words_ids, 118 | **self.kwargs 119 | ) 120 | return answers[0] 121 | 122 | def prepare_texts(self, texts, conv_temp): 123 | convs = [conv_temp.copy() for _ in range(len(texts))] 124 | [ 125 | conv.append_message(conv.roles[0], '\n{}'.format(text)) 126 | for conv, text in zip(convs, texts) 127 | ] 128 | [conv.append_message(conv.roles[1], None) for conv in convs] 129 | texts = [conv.get_prompt() for conv in convs] 130 | return texts 131 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/wemm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import sys 4 | from ..smp import * 5 | from .base import BaseModel 6 | from ..utils import DATASET_TYPE 7 | from transformers import AutoModel, GenerationConfig 8 | 9 | 10 | class WeMM(BaseModel): 11 | def __init__(self, model_path='feipengma/WeMM', **kwargs): 12 | self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True) 13 | self.wemm.cuda() 14 | self.wemm.eval() 15 | torch.cuda.empty_cache() 16 | 17 | def use_custom_prompt(self, dataset): 18 | assert dataset is not None 19 | if DATASET_TYPE(dataset) == 'multi-choice': 20 | return True 21 | return False 22 | 23 | def build_prompt(self, line, dataset=None): 24 | assert self.use_custom_prompt(dataset) 25 | assert dataset is None or isinstance(dataset, str) 26 | tgt_path = self.dump_image(line, dataset) 27 | question = line['question'] 28 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 29 | if hint is not None: 30 | question = hint + '\n' + question 31 | options = { 32 | cand: line[cand] 33 | for cand in string.ascii_uppercase 34 | if cand in line and not pd.isna(line[cand]) 35 | } 36 | for key, item in options.items(): 37 | question += f'\n{key}. {item}' 38 | prompt = question 39 | 40 | if len(options): 41 | prompt += ( 42 | '\n请直接回答选项字母。' if cn_string(prompt) else 43 | "\nAnswer with the option's letter from the given choices directly." 44 | ) 45 | else: 46 | prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' 47 | 48 | message = [dict(type='text', value=prompt)] 49 | message.extend([dict(type='image', value=p) for p in tgt_path]) 50 | return message 51 | 52 | def generate_inner(self, message, dataset=None): 53 | prompt, image_path = self.message_to_promptimg(message) 54 | 55 | if dataset == 'HallusionBench': 56 | prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.' 
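# Dataset-specific decoding: only MMVet gets an explicit sampling
# GenerationConfig below; for every other dataset gen_config stays None,
# so mm_generate presumably falls back to the defaults shipped with the
# remote WeMM checkpoint (assumed behaviour, not shown in this repo).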
57 | 58 | gen_config = None 59 | if dataset == 'MMVet': 60 | gen_config = GenerationConfig( 61 | max_new_tokens=512, 62 | do_sample=True, 63 | temperatures=0.7, 64 | num_beams=3, 65 | eos_token_id=self.wemm.tokenizer.eos_token_id, 66 | pad_token_id=self.wemm.tokenizer.pad_token_id 67 | if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id, 68 | ) 69 | pred = self.wemm.mm_generate(image_path, prompt, gen_config) 70 | 71 | return pred 72 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/xcomposer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sharecaptioner import ShareCaptioner 2 | from .xcomposer import XComposer 3 | from .xcomposer2 import XComposer2 4 | from .xcomposer2_4KHD import XComposer2_4KHD 5 | 6 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD'] 7 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/sharecaptioner.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/sharecaptioner.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer2.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer2.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer2_4KHD.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/VLMEvalKit/vlmeval/vlm/xcomposer/__pycache__/xcomposer2_4KHD.cpython-39.pyc -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/xcomposer/sharecaptioner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from ..base import BaseModel 4 | from ...smp import * 5 | from ...utils import DATASET_TYPE 6 | 7 | 8 | class ShareCaptioner(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='Lin-Chen/ShareCaptioner', **kwargs): 14 | assert model_path is not None 15 | tokenizer = 
AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 16 | self.model = AutoModelForCausalLM.from_pretrained( 17 | model_path, device_map='cuda', trust_remote_code=True).eval() 18 | self.model.tokenizer = tokenizer 19 | self.model.cuda() 20 | self.model.half() 21 | 22 | def use_custom_prompt(self, dataset): 23 | assert dataset is not None 24 | if DATASET_TYPE(dataset) == 'multi-choice': 25 | return True 26 | return False 27 | 28 | def build_prompt(self, line, dataset=None): 29 | assert dataset is None or isinstance(dataset, str) 30 | assert self.use_custom_prompt(dataset) 31 | tgt_path = self.dump_image(line, dataset) 32 | 33 | if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': 34 | question = line['question'] 35 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 36 | if hint is not None: 37 | question = hint + '\n' + question 38 | 39 | option_candidate = string.ascii_uppercase 40 | options = { 41 | cand: line[cand] 42 | for cand in option_candidate 43 | if cand in line and not pd.isna(line[cand]) 44 | } 45 | for key, item in options.items(): 46 | question += f'\n{key}. {item}' 47 | prompt = question 48 | 49 | if not cn_string(prompt): 50 | prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly." 51 | else: 52 | prompt = prompt + '\n' + '请直接回答选项字母。' 53 | else: 54 | prompt = line['question'] 55 | message = [dict(type='text', value=prompt)] 56 | message.extend([dict(type='image', value=s) for s in tgt_path]) 57 | return message 58 | 59 | def generate_inner(self, message, dataset=None): 60 | prompt, image_path = self.message_to_promptimg(message) 61 | seg1 = '<|User|>:' 62 | seg2 = f'{prompt}{self.model.eoh}\n<|Bot|>:' 63 | self.seg_emb1 = self.model.encode_text(seg1, add_special_tokens=True) 64 | self.seg_emb2 = self.model.encode_text(seg2, add_special_tokens=False) 65 | 66 | image = Image.open(image_path).convert('RGB') 67 | image = self.model.vis_processor(image).unsqueeze(0) 68 | image = image.to(self.model.device) 69 | tmp_bs = image.shape[0] 70 | tmp_seg_emb1 = self.seg_emb1.repeat(tmp_bs, 1, 1) 71 | tmp_seg_emb2 = self.seg_emb2.repeat(tmp_bs, 1, 1) 72 | with torch.cuda.amp.autocast(): 73 | with torch.no_grad(): 74 | image = self.model.encode_img(image) 75 | input_emb = torch.cat( 76 | [tmp_seg_emb1, image, tmp_seg_emb2], dim=1) 77 | out_embeds = self.model.internlm_model.generate( 78 | inputs_embeds=input_emb, 79 | max_length=500, 80 | num_beams=3, 81 | min_length=1, 82 | do_sample=True, 83 | repetition_penalty=1.5, 84 | length_penalty=1.0, 85 | temperature=1., 86 | eos_token_id=self.model.tokenizer.eos_token_id, 87 | num_return_sequences=1) 88 | 89 | for j, out in enumerate(out_embeds): 90 | out[out == -1] = 2 91 | response = self.model.decode_text([out]) 92 | return response 93 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/xcomposer/xcomposer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModel, AutoTokenizer 3 | from transformers import StoppingCriteria, StoppingCriteriaList 4 | from PIL import Image 5 | from ..base import BaseModel 6 | from ...smp import * 7 | 8 | 9 | class StoppingCriteriaSub(StoppingCriteria): 10 | def __init__(self, stops=[], encounters=1): 11 | super().__init__() 12 | self.stops = stops 13 | 14 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): 15 | for stop in self.stops: 16 | if 
torch.all((stop == input_ids[0][-len(stop):])).item(): 17 | return True 18 | 19 | return False 20 | 21 | 22 | from ...utils import DATASET_TYPE 23 | 24 | 25 | class XComposer(BaseModel): 26 | 27 | INSTALL_REQ = False 28 | INTERLEAVE = False 29 | 30 | def __init__(self, model_path='internlm/internlm-xcomposer-vl-7b', **kwargs): 31 | assert model_path is not None 32 | self.model_path = model_path 33 | 34 | model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval() 35 | tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) 36 | model.tokenizer = tokenizer 37 | self.model = model 38 | self.device = self.model.internlm_model.model.embed_tokens.weight.device 39 | self.eoh = '' 40 | self.eoa = '' 41 | stop_words_ids = [ 42 | torch.tensor([103027]).to(self.device), # end of human 43 | torch.tensor([103028]).to(self.device), # end of bot 44 | ] 45 | default_kwargs = { 46 | 'max_new_tokens': 512, 'num_beams': 5, 'do_sample': False, 47 | 'min_length': 1, 'repetition_penalty': 1.5, 'length_penalty': 1.0 48 | } 49 | default_kwargs.update(kwargs) 50 | self.kwargs = default_kwargs 51 | self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) 52 | 53 | def generate_inner(self, message, dataset=None): 54 | if len(message) == 2: 55 | if message[0]['type'] == 'text' and message[1]['type'] == 'image': 56 | message = [message[1], message[0]] 57 | kwargs = cp.deepcopy(self.kwargs) 58 | if dataset is not None: 59 | if DATASET_TYPE(dataset) == 'multi-choice': 60 | kwargs['max_new_tokens'] = 5 61 | kwargs['num_beams'] = 5 62 | 63 | with torch.cuda.amp.autocast(): 64 | with torch.no_grad(): 65 | prompt_embs = self.message_to_prompt_embs(message, dataset) 66 | outputs = self.model.internlm_model.generate( 67 | inputs_embeds=prompt_embs, 68 | stopping_criteria=self.stopping_criteria, 69 | **kwargs 70 | ) 71 | 72 | output_token = outputs[0] 73 | if output_token[0] == 0: 74 | output_token = output_token[1:] 75 | if output_token[0] == 1: 76 | output_token = output_token[1:] 77 | output_text = self.model.tokenizer.decode(output_token, add_special_tokens=False) 78 | 79 | output_text = output_text.split(self.model.eoa)[0] 80 | output_text = output_text.split('<|Bot|>')[-1].strip() 81 | return output_text 82 | 83 | def message_to_prompt_embs(self, message, dataset=None): 84 | assert isinstance(message, list) 85 | img_embeds = [] 86 | prompt_full = '<|User|>: ' 87 | for msg in message: 88 | if msg['type'] == 'text': 89 | prompt_full += msg['value'] 90 | elif msg['type'] == 'image': 91 | image = Image.open(msg['value']).convert('RGB') 92 | image = self.model.vis_processor(image).unsqueeze(0).to(self.device) 93 | img_embeds.append(self.model.encode_img(image)) 94 | prompt_full += '' 95 | 96 | prompt_full += self.model.eoh + ' <|Bot|>: ' 97 | if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': 98 | prompt_full += 'Answer: The answer is ' 99 | elif dataset is not None and DATASET_TYPE(dataset) in ['VQA', 'QA', 'Y/N']: 100 | prompt_full += 'Answer: ' 101 | 102 | prompt_segs = prompt_full.split('') 103 | assert len(prompt_segs) == len(img_embeds) + 1 104 | 105 | prompt_seg_tokens = [ 106 | self.model.tokenizer(seg, return_tensors='pt', add_special_tokens=(i == 0)).to(self.device).input_ids.long() 107 | for i, seg in enumerate(prompt_segs) 108 | ] 109 | prompt_seg_embs = [self.model.internlm_model.model.embed_tokens(seg) for seg in prompt_seg_tokens] 110 | all_embeddings = [] 111 | for i in 
range(len(img_embeds)): 112 | all_embeddings.extend([prompt_seg_embs[i], img_embeds[i]]) 113 | all_embeddings.append(prompt_seg_embs[-1]) 114 | prompt_embs = torch.cat(all_embeddings, dim=1) 115 | return prompt_embs 116 | 117 | def use_custom_prompt(self, dataset): 118 | assert dataset is not None 119 | if DATASET_TYPE(dataset) == 'multi-choice': 120 | return True 121 | return False 122 | 123 | def build_prompt(self, line, dataset=None): 124 | assert dataset is None or isinstance(dataset, str) 125 | assert self.use_custom_prompt(dataset) 126 | tgt_path = self.dump_image(line, dataset) 127 | 128 | question = line['question'] 129 | options = { 130 | cand: line[cand] 131 | for cand in string.ascii_uppercase 132 | if cand in line and not pd.isna(line[cand]) 133 | } 134 | options_prompt = '' 135 | for key, item in options.items(): 136 | options_prompt += f'{key}. {item}\n' 137 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 138 | context = 'N/A' if hint is None else hint 139 | mid_prompt = 'Context: ' + context + '\nQuestion: ' + question 140 | if len(options_prompt): 141 | mid_prompt += '\nOptions: ' + options_prompt 142 | 143 | if len(options): 144 | txt_prompt = 'Please answer this question by choosing the correct choice.' 145 | else: 146 | txt_prompt = 'Please answer this question directly. ' 147 | prompt = txt_prompt + mid_prompt 148 | message = [dict(type='text', value=prompt)] 149 | message.extend([dict(type='image', value=s) for s in tgt_path]) 150 | return message 151 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/yi_vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os.path as osp 4 | import warnings 5 | from PIL import Image 6 | from vlmeval.smp import get_cache_path, load, dump, splitlen 7 | from huggingface_hub import snapshot_download 8 | from .base import BaseModel 9 | 10 | 11 | """ 12 | You can perform inference of Yi-VL through the following steps: 13 | 1. clone the repo https://github.com/01-ai/Yi to path-to-Yi 14 | 2. set up the environment and install the required packages in path-to-Yi/VL/requirements.txt 15 | 3. set Yi_ROOT in vlmeval/config.py 16 | Yi_ROOT = path-to-Yi 17 | 18 | You are all set now! To run a demo for Yi-VL: 19 | ```python 20 | from vlmeval import * 21 | model = supported_VLM['Yi_VL_6B']() 22 | model.generate('apple.jpg', 'What is in this image?') 23 | ``` 24 | To run evaluation for Yi-VL, use `python run.py --model Yi_VL_6B --data {dataset_list}` 25 | """ 26 | 27 | 28 | def edit_config(repo_id): 29 | if not osp.exists(repo_id): 30 | root = get_cache_path(repo_id) 31 | else: 32 | root = repo_id 33 | assert root is not None and osp.exists(root) 34 | cfg = osp.join(root, 'config.json') 35 | data = load(cfg) 36 | mm_vision_tower = data['mm_vision_tower'] 37 | if mm_vision_tower.startswith('./vit/'): 38 | data['mm_vision_tower'] = osp.join(root, mm_vision_tower) 39 | assert osp.exists(data['mm_vision_tower']) 40 | dump(data, cfg) 41 | 42 | 43 | def disable_torch_init(): 44 | """ 45 | Disable the redundant torch default initialization to accelerate model creation. 
46 | """ 47 | import torch 48 | 49 | setattr(torch.nn.Linear, 'reset_parameters', lambda self: None) 50 | setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None) 51 | 52 | 53 | class Yi_VL(BaseModel): 54 | 55 | INSTALL_REQ = True 56 | INTERLEAVE = False 57 | 58 | def __init__(self, 59 | model_path='01-ai/Yi-VL-6B', 60 | root=None, 61 | **kwargs): 62 | 63 | if root is None: 64 | warnings.warn( 65 | 'Please set root to the directory of Yi, ' 66 | 'which is cloned from here: https://github.com/01-ai/Yi.' 67 | ) 68 | 69 | self.root = osp.join(root, 'VL') 70 | sys.path.append(self.root) 71 | 72 | if splitlen(model_path, '/') == 2 and not osp.exists(model_path): 73 | if get_cache_path(model_path) is None: 74 | snapshot_download(repo_id=model_path) 75 | edit_config(model_path) 76 | elif osp.exists(model_path): 77 | edit_config(model_path) 78 | 79 | from llava.mm_utils import get_model_name_from_path, load_pretrained_model 80 | from llava.model.constants import key_info 81 | 82 | disable_torch_init() 83 | key_info['model_path'] = model_path 84 | get_model_name_from_path(model_path) 85 | self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model( 86 | model_path, 87 | device_map='cpu') 88 | self.model = self.model.cuda() 89 | self.conv_mode = 'mm_default' 90 | 91 | kwargs_default = dict(temperature=0.2, 92 | num_beams=1, 93 | do_sample=False, 94 | max_new_tokens=1024, 95 | top_p=None) 96 | kwargs_default.update(kwargs) 97 | self.kwargs = kwargs_default 98 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 99 | 100 | def generate_inner(self, message, dataset=None): 101 | prompt, image_path = self.message_to_promptimg(message) 102 | 103 | from llava.conversation import conv_templates 104 | from llava.model.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX 105 | from llava.mm_utils import KeywordsStoppingCriteria, expand2square, tokenizer_image_token 106 | 107 | qs = DEFAULT_IMAGE_TOKEN + '\n' + prompt 108 | conv = conv_templates[self.conv_mode].copy() 109 | conv.append_message(conv.roles[0], qs) 110 | conv.append_message(conv.roles[1], None) 111 | prompt = conv.get_prompt() 112 | 113 | input_ids = ( 114 | tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') 115 | .unsqueeze(0) 116 | .cuda() 117 | ) 118 | 119 | image = Image.open(image_path) 120 | if getattr(self.model.config, 'image_aspect_ratio', None) == 'pad': 121 | if image.mode == 'L': 122 | background_color = int(sum([int(x * 255) for x in self.image_processor.image_mean]) / 3) 123 | else: 124 | background_color = tuple(int(x * 255) for x in self.image_processor.image_mean) 125 | image = expand2square(image, background_color) 126 | image_tensor = self.image_processor.preprocess(image, return_tensors='pt')[ 127 | 'pixel_values' 128 | ][0] 129 | 130 | stop_str = conv.sep 131 | keywords = [stop_str] 132 | stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids) 133 | self.model = self.model.to(dtype=torch.bfloat16) 134 | with torch.inference_mode(): 135 | output_ids = self.model.generate( 136 | input_ids, 137 | images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16).cuda(), 138 | stopping_criteria=[stopping_criteria], 139 | use_cache=True, 140 | **self.kwargs) 141 | 142 | input_token_len = input_ids.shape[1] 143 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 144 | if n_diff_input_output > 0: 145 | print( 146 | f'[Warning] {n_diff_input_output} output_ids are not 
the same as the input_ids' 147 | ) 148 | outputs = self.tokenizer.batch_decode( 149 | output_ids[:, input_token_len:], skip_special_tokens=True 150 | )[0] 151 | outputs = outputs.strip() 152 | 153 | if outputs.endswith(stop_str): 154 | outputs = outputs[: -len(stop_str)] 155 | outputs = outputs.strip() 156 | return outputs 157 | -------------------------------------------------------------------------------- /assets/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/assets/overview.jpg -------------------------------------------------------------------------------- /assets/taskmap.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMIU/642001df618a57d65869ef3975021deabfbcc891/assets/taskmap.jpg -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import base64 4 | import time 5 | from openai import OpenAI 6 | from multiprocessing import Pool 7 | import re 8 | 9 | def remove_punctuation(text): 10 | return re.sub(r'^[.,()]+|[.,()]+$', '', text) 11 | 12 | client = OpenAI( 13 | base_url='xx', 14 | api_key='xx', 15 | ) 16 | 17 | def build_prompt(question, options, prediction): 18 | tmpl = ( 19 | "You are an AI assistant who will help me to match an answer with several options of a single-choice question. " 20 | "You are provided with a question, several options, and an answer, and you need to find which option is most similar to the answer. " 21 | "If the meaning of all options are significantly different from the answer, output Z. " 22 | "When the options are mostly numbers, if the model outputs numbers in the same format, please do not be too precise and try to match an answer as much as possible. "\ 23 | "Your should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n" 24 | "Example 1: \n" 25 | "Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\nAnswer: a cute teddy bear\nYour output: A\n" 26 | "Example 2: \n" 27 | "Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. 
dog\nAnswer: Spider\nYour output: Z\n" 28 | "Example 3: \n" 29 | "Question: {}?\nOptions: {}\nAnswer: {}\nYour output: " 30 | ) 31 | return tmpl.format(question, options, prediction) 32 | 33 | 34 | def process_data(args): 35 | data_tmp, modelname = args 36 | client = OpenAI( 37 | # base_url='https://kkkc.net/v1', 38 | # api_key='sk-YJaHfazVSf2WDkAl1bAdE17bF3Ae4923Ba888293B31d13C4', 39 | base_url='xx', 40 | api_key='xx', 41 | ) 42 | 43 | options = data_tmp['options'] 44 | question = data_tmp['question'] 45 | prediction = data_tmp[modelname].strip() 46 | 47 | if modelname == 'Claude3' and "copyrighted material" in prediction: 48 | data_tmp[f'{modelname}_choice'] = 'Z' 49 | return data_tmp 50 | if prediction == 'image none' or prediction == 'model error or image error' or prediction == 'image error' or prediction == 'model error' or prediction == "": 51 | data_tmp[f'{modelname}_choice'] = 'Z' 52 | return data_tmp 53 | if '\u00a0' in prediction: 54 | prediction = prediction.replace('\u00a0','') 55 | 56 | 57 | prediction = remove_punctuation(prediction.strip()) 58 | 59 | if prediction.strip().lower() not in ['a','b','c','d','e','f','g','h','i','j','k','l','m','n']: 60 | 61 | 62 | 63 | content = build_prompt(question,options,prediction) 64 | 65 | try: 66 | response = client.chat.completions.create( 67 | model="gpt-4o-mini", 68 | messages=[ 69 | { 70 | "role": "user", 71 | "content": [ 72 | {"type": "text", "text": content}, 73 | ], 74 | } 75 | ], 76 | max_tokens=512, 77 | ) 78 | # print(response.choices[0].message.content) 79 | grading = response.choices[0].message.content 80 | 81 | except Exception as e: 82 | print('errror: ', e) 83 | # grading = str(e) 84 | grading = 'GPT error' 85 | 86 | 87 | data_tmp[f'{modelname}_choice'] = grading.strip() 88 | print(modelname,': ',data_tmp[f'{modelname}_choice']) 89 | return data_tmp 90 | else: 91 | data_tmp[f'{modelname}_choice'] = prediction.strip() 92 | print(modelname,': ',data_tmp[f'{modelname}_choice']) 93 | return data_tmp 94 | 95 | 96 | 97 | def main(): 98 | # modelnames = ['internvl1.5-chat'] 99 | # modelnames = ['Gemini','Gemini1.0'] 100 | # modelnames = ['GPT4o','Gemini','Gemini1.0'] 101 | # modelnames = ['Llava-interleave'] 102 | modelnames = ['Llava-interleave', 'qwen_chat', 'XComposer2', 'deepseek_vl_7b', 'qwen_base', 'XComposer2_1.8b', 'flamingov2', 'deepseek_vl_1.3b', 'internvl1.5-chat', 'idefics2_8b', 'Mantis', 'idefics_9b_instruct'] 103 | directorys = ['xx','xx'] 104 | 105 | for directory in directorys: 106 | tasknames = os.listdir(directory) 107 | for taskname in tasknames: 108 | 109 | path = os.path.join(directory,taskname) 110 | for modelname in modelnames: 111 | path = os.path.join(directory,taskname) 112 | path = os.path.join(path,modelname) 113 | 114 | print(taskname,modelname) 115 | json_path = os.path.join(path,'metadata_info.json') 116 | 117 | 118 | 119 | if not os.path.exists(json_path): 120 | print(json_path,' not exist') 121 | continue 122 | 123 | # output_json_path = os.path.join(path,'metadata_info_choice.json') 124 | output_json_path = os.path.join(path,'metadata_info_choice.json') 125 | # if os.path.exists(output_json_path) or os.path.exists(output_json_path1): 126 | if os.path.exists(output_json_path): 127 | print(output_json_path, ' already have') 128 | continue 129 | 130 | with open(json_path,'r') as f: 131 | data = json.load(f) 132 | 133 | # 将data和modelname打包成元组列表 134 | data_with_modelname = [(data_tmp, modelname) for data_tmp in data] 135 | 136 | 137 | 138 | pool = Pool(processes=10) # Adjust the number of processes 
as per your machine's capability 139 | # result = pool.map(process_data, data, modelname) 140 | # 使用map方法传递打包后的元组列表 141 | result = pool.map(process_data, data_with_modelname) 142 | 143 | # output_json_path = os.path.join(path,'metadata_info_choice.json') 144 | 145 | with open(output_json_path, 'w') as f: 146 | json.dump(result, f) 147 | 148 | print(taskname,modelname,'OK') 149 | 150 | 151 | 152 | if __name__ == '__main__': 153 | main() 154 | 155 | -------------------------------------------------------------------------------- /evaluate_correct.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pandas as pd 4 | 5 | directorys = [ 6 | 'xx' 7 | ] 8 | 9 | 10 | # Initialize global DataFrames to store data 11 | global_accuracy_df = pd.DataFrame() 12 | 13 | for directory in directorys: 14 | tasknames = sorted(os.listdir(directory)) 15 | 16 | modelnames = ['GPT4o','Claude3','Gemini','Gemini1.0','Llava-interleave','Mantis','InternVL2','internvl1.5-chat','qwen_chat', 'qwen_base', 'idefics_9b_instruct','flamingov2', 'deepseek_vl_1.3b', 'XComposer2_1.8b', 'deepseek_vl_7b', 'idefics2_8b', 'XComposer2'] 17 | # modelnames = ['Llava-interleave'] 18 | # Initialize dictionaries to store data 19 | accuracy_data = {modelname: [] for modelname in modelnames} 20 | 21 | for taskname in tasknames: 22 | path = os.path.join(directory, taskname) 23 | for modelname in modelnames: 24 | json_path = os.path.join(path, modelname, 'metadata_info_choice.json') 25 | 26 | if os.path.exists(json_path): 27 | with open(json_path, 'r') as f: 28 | data = json.load(f) 29 | else: 30 | print('no json: ', taskname,modelname) 31 | accuracy_data[modelname].append(None) 32 | continue 33 | 34 | 35 | 36 | cnt = 0 37 | correct = 0 38 | cnt_z = 0 39 | 40 | for i in range(len(data)): 41 | data_tmp = data[i] 42 | flag = True 43 | if data_tmp[f'{modelname}_choice'].strip() == 'GPT error': 44 | print(modelname, taskname, 'GPT error') 45 | continue 46 | 47 | if data_tmp["output"] == None: 48 | flag = False 49 | continue 50 | gt = data_tmp["output"].strip().lower() 51 | 52 | if flag == False: 53 | continue 54 | 55 | cnt += 1 56 | 57 | if data_tmp[f'{modelname}_choice'].strip().lower() in gt: 58 | correct += 1 59 | 60 | 61 | accuracy_data[modelname].append(correct / cnt) 62 | print(correct / cnt, taskname, modelname) 63 | 64 | 65 | 66 | # Convert dictionaries to DataFrames 67 | accuracy_df = pd.DataFrame(accuracy_data, index=tasknames) 68 | 69 | # Append to global DataFrames 70 | global_accuracy_df = pd.concat([global_accuracy_df, accuracy_df]) 71 | 72 | # Calculate the overall average for each model 73 | global_accuracy_df.loc['Overall'] = global_accuracy_df.mean() 74 | 75 | # Save global DataFrames to CSV files 76 | global_accuracy_df.to_csv('./Accuracy_data_all.csv') 77 | 78 | print("Global DataFrames have been saved as CSV files.") 79 | 80 | 81 | --------------------------------------------------------------------------------
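Taken together, `evaluate.py` and `evaluate_correct.py` form a two-stage scoring pipeline: the first maps each model's free-form prediction to a single option letter (error placeholders are mapped straight to 'Z', and `gpt-4o-mini` is called only when the raw prediction is not already a bare option letter) and writes `metadata_info_choice.json` alongside each model's `metadata_info.json`; the second aggregates those letters into per-task accuracy and an overall average saved to `Accuracy_data_all.csv`. The sketch below scores a single task/model pair from one such JSON file. It is a minimal illustration, assuming the record layout shown above (`output` holds the ground-truth letter, `<model>_choice` the matched letter); the file path is a placeholder in the same `xx` style used by the scripts.

```python
import json


def task_accuracy(json_path, modelname):
    """Score one metadata_info_choice.json for one model.

    Mirrors the counting rules in evaluate_correct.py: records whose
    ground truth is missing or whose matched choice is 'GPT error' are
    skipped rather than counted as wrong, and a prediction counts as
    correct when the matched letter appears in the ground-truth string.
    """
    with open(json_path, 'r') as f:
        records = json.load(f)

    cnt, correct = 0, 0
    for rec in records:
        choice = rec.get(f'{modelname}_choice', '').strip()
        if choice == 'GPT error' or rec.get('output') is None:
            continue  # skipped, exactly as in evaluate_correct.py
        cnt += 1
        if choice.lower() in rec['output'].strip().lower():
            correct += 1
    return correct / cnt if cnt else 0.0


if __name__ == '__main__':
    # 'xx' placeholder path, matching the convention in the scripts above.
    print(task_accuracy('xx/taskname/GPT4o/metadata_info_choice.json', 'GPT4o'))
```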