├── .github └── workflows │ └── lint.yml ├── .gitignore ├── .vscode └── launch.json ├── Quickstart.md ├── README.md ├── VLMEvalKit-main ├── .github │ └── workflows │ │ └── lint.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets │ ├── LOGO.svg │ └── apple.jpg ├── docs │ ├── en │ │ ├── Development.md │ │ └── Quickstart.md │ ├── ja │ │ └── README_ja.md │ └── zh-CN │ │ ├── Development_zh-CN.md │ │ ├── Quickstart_zh-CN.md │ │ └── README_zh-CN.md ├── requirements.txt ├── run.py ├── scripts │ ├── AI2D_preproc.ipynb │ ├── apires_scan.py │ ├── auto_run.py │ ├── cover.sh │ ├── mmb_eval_gradio.py │ ├── run.sh │ ├── srun.sh │ ├── summarize.py │ └── visualize.ipynb ├── setup.py └── vlmeval │ ├── __init__.py │ ├── api │ ├── __init__.py │ ├── base.py │ ├── claude.py │ ├── cloudwalk.py │ ├── gemini.py │ ├── glm_vision.py │ ├── gpt.py │ ├── gpt_int.py │ ├── hf_chat_model.py │ ├── qwen_api.py │ ├── qwen_vl_api.py │ ├── reka.py │ └── stepai.py │ ├── config.py │ ├── evaluate │ ├── OCRBench.py │ ├── __init__.py │ ├── coco_eval.py │ ├── llavabench.py │ ├── mathvista_eval.py │ ├── misc.py │ ├── mmvet_eval.py │ ├── multiple_choice.py │ ├── vqa_eval.py │ └── yes_or_no.py │ ├── inference.py │ ├── smp │ ├── __init__.py │ ├── file.py │ ├── log.py │ ├── misc.py │ └── vlm.py │ ├── tools.py │ ├── utils │ ├── __init__.py │ ├── custom_prompt.py │ ├── dataset.py │ ├── dataset_config.py │ ├── matching_util.py │ ├── mp_util.py │ └── result_transfer.py │ └── vlm │ ├── __init__.py │ ├── base.py │ ├── bunnyllama3.py │ ├── cogvlm.py │ ├── deepseek_vl.py │ ├── emu.py │ ├── idefics.py │ ├── instructblip.py │ ├── internvl_chat.py │ ├── llava │ ├── __init__.py │ ├── llava.py │ └── llava_xtuner.py │ ├── mgm.py │ ├── minicpm_v.py │ ├── minigpt4.py │ ├── misc │ ├── blip2_instruct_vicuna13b.yaml │ ├── blip2_instruct_vicuna7b.yaml │ ├── minigpt4_13b_eval.yaml │ ├── minigpt4_7b_eval.yaml │ └── minigptv2_eval.yaml │ ├── mmalaya.py │ ├── monkey.py │ ├── mplug_owl2.py │ ├── omnilmm.py │ ├── open_flamingo.py │ ├── paligemma.py │ ├── pandagpt.py │ ├── phi3_vision.py │ ├── qh_360vl.py │ ├── qwen_vl.py │ ├── transcore_m.py │ ├── visualglm.py │ ├── vxverse.py │ ├── wemm.py │ ├── xcomposer │ ├── __init__.py │ ├── sharecaptioner.py │ ├── xcomposer.py │ ├── xcomposer2.py │ └── xcomposer2_4KHD.py │ └── yi_vl.py ├── assets ├── LOGO.svg ├── apple.jpg ├── metatask_eval.png ├── overall_progress.png └── overview.jpg ├── requirements.txt ├── run.py ├── setup.py └── vlmeval ├── __init__.py ├── api ├── __init__.py ├── base.py ├── claude.py ├── cloudwalk.py ├── gemini.py ├── glm_vision.py ├── gpt.py ├── gpt_int.py ├── hf_chat_model.py ├── qwen_api.py ├── qwen_vl_api.py ├── reka.py └── stepai.py ├── config.py ├── evaluate ├── OCRBench.py ├── __init__.py ├── coco_eval.py ├── llavabench.py ├── mathvista_eval.py ├── misc.py ├── mmvet_eval.py ├── multiple_choice.py ├── vqa_eval.py └── yes_or_no.py ├── inference.py ├── smp ├── __init__.py ├── file.py ├── log.py ├── misc.py └── vlm.py ├── tools.py ├── utils ├── __init__.py ├── custom_prompt.py ├── dataset.py ├── dataset_config.py ├── matching_util.py ├── mp_util.py └── result_transfer.py └── vlm ├── __init__.py ├── base.py ├── bunnyllama3.py ├── cogvlm.py ├── deepseek_vl.py ├── emu.py ├── idefics.py ├── instructblip.py ├── internvl_chat.py ├── llava ├── __init__.py ├── llava.py └── llava_xtuner.py ├── mgm.py ├── minicpm_v.py ├── minigpt4.py ├── misc ├── blip2_instruct_vicuna13b.yaml ├── blip2_instruct_vicuna7b.yaml ├── minigpt4_13b_eval.yaml ├── minigpt4_7b_eval.yaml └── 
minigptv2_eval.yaml ├── mmalaya.py ├── monkey.py ├── mplug_owl2.py ├── omnilmm.py ├── open_flamingo.py ├── paligemma.py ├── pandagpt.py ├── phi3_vision.py ├── qh_360vl.py ├── qwen_vl.py ├── transcore_m.py ├── visualglm.py ├── vxverse.py ├── wemm.py ├── xcomposer ├── __init__.py ├── sharecaptioner.py ├── xcomposer.py ├── xcomposer2.py └── xcomposer2_4KHD.py └── yi_vl.py /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.7 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.7 18 | - name: Install pre-commit hook 19 | run: | 20 | pip install pre-commit 21 | pre-commit install 22 | - name: Linting 23 | run: pre-commit run --all-files 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # Images 156 | images/ 157 | 158 | scripts/*ttf 159 | 160 | lvlm_zoo/ 161 | LMUData/ 162 | work_dirs 163 | 164 | batchscript-* 165 | phoenix-slurm-* 166 | LMUData 167 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python Debugger: Current File", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal" 13 | }, 14 | { 15 | "name": "main", 16 | "type": "python", 17 | "request": "launch", 18 | "program": "run.py", 19 | "console": "integratedTerminal", 20 | "justMyCode": true, 21 | "args": [ 22 | "--data", "MMT-Bench_ALL", 23 | "--model", "llava_v1.5_7b", 24 | "--work-dir", "work_dirs/mmtbench" 25 | ] 26 | }, 27 | ] 28 | } -------------------------------------------------------------------------------- /VLMEvalKit-main/.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.7 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.7 18 | - name: Install pre-commit hook 19 | run: | 20 | pip install pre-commit 21 | pre-commit install 22 | - name: Linting 23 | run: pre-commit run --all-files 24 | -------------------------------------------------------------------------------- /VLMEvalKit-main/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 
| share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # Images 156 | images/ 157 | 158 | scripts/*ttf 159 | -------------------------------------------------------------------------------- /VLMEvalKit-main/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: | 2 | (?x)^( 3 | scripts/| 4 | assets/| 5 | vlmeval/config.py 6 | ) 7 | repos: 8 | - repo: https://github.com/PyCQA/flake8 9 | rev: 5.0.4 10 | hooks: 11 | - id: flake8 12 | args: ["--max-line-length=120", "--ignore=F401,F403,F405,E402,E722,E741,W503"] 13 | exclude: ^configs/ 14 | - repo: https://github.com/pre-commit/mirrors-yapf 15 | rev: v0.30.0 16 | hooks: 17 | - id: yapf 18 | args: ["--style={column_limit=120}"] 19 | - repo: https://github.com/pre-commit/pre-commit-hooks 20 | rev: v3.1.0 21 | hooks: 22 | - id: trailing-whitespace 23 | - id: check-yaml 24 | - id: end-of-file-fixer 25 | - id: requirements-txt-fixer 26 | - id: double-quote-string-fixer 27 | - id: check-merge-conflict 28 | - id: fix-encoding-pragma 29 | args: ["--remove"] 30 | - id: mixed-line-ending 31 | args: ["--fix=lf"] 32 | -------------------------------------------------------------------------------- /VLMEvalKit-main/assets/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMT-Bench/84012c95e31c2986521ea5b7c16a88e36e9958c2/VLMEvalKit-main/assets/apple.jpg -------------------------------------------------------------------------------- /VLMEvalKit-main/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | gradio==4.15.0 3 | huggingface_hub 4 | matplotlib 5 | numpy>=1.23.4 6 | omegaconf 7 | openai==1.3.5 8 | opencv-python>=4.4.0.46 9 | openpyxl 10 | pandas>=1.5.3 11 | pillow 12 | portalocker 13 | protobuf 14 | pycocoevalcap 15 | python-dotenv 16 | requests 17 | rich 18 | seaborn 19 | sentencepiece 20 | sty 21 | tabulate 22 | tiktoken 23 | timeout-decorator 24 | torch>=2.0.1 25 | tqdm 26 | transformers 27 | typing_extensions==4.7.1 28 | validators 29 | visual_genome 30 | xlsxwriter 31 | xtuner 32 | -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/apires_scan.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from vlmeval import * 3 | FAIL_MSG = 'Failed to obtain answer via API.' 
4 | 5 | root = sys.argv[1] 6 | if root[-1] in '/\\': 7 | root = root[:-1] 8 | 9 | model_name = root.split('/')[-1] 10 | datasets = list(dataset_URLs) 11 | 12 | for d in datasets: 13 | fname = f'{model_name}_{d}.xlsx' 14 | pth = osp.join(root, fname) 15 | if osp.exists(pth): 16 | data = load(pth) 17 | # Detect Failure 18 | assert 'prediction' in data 19 | data['prediction'] = [str(x) for x in data['prediction']] 20 | fail = [FAIL_MSG in x for x in data['prediction']] 21 | if sum(fail): 22 | nfail = sum(fail) 23 | ntot = len(fail) 24 | print(f'Model {model_name} x Dataset {d}: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ') 25 | 26 | eval_files = ls(root, match=f'{model_name}_{d}_') 27 | eval_files = [x for x in eval_files if listinstr([f'{d}_openai', f'{d}_gpt'], x) and x.endswith('.xlsx')] 28 | 29 | if len(eval_files) == 0: 30 | print(f'Model {model_name} x Dataset {d} openai missing') 31 | continue 32 | 33 | assert len(eval_files) == 1 34 | eval_file = eval_files[0] 35 | data = load(eval_file) 36 | 37 | if 'MMVet' in d: 38 | bad = [x for x in data['log'] if 'All 5 retries failed.' in str(x)] 39 | if len(bad): 40 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 41 | elif 'MathVista' in d: 42 | bad = [x for x in data['res'] if FAIL_MSG in str(x)] 43 | if len(bad): 44 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 45 | 46 | elif d == 'LLaVABench': 47 | sub = data[data['gpt4_score'] == -1] 48 | sub = sub[sub['gpt4_score'] == -1] 49 | if len(sub): 50 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(sub)} out of {len(data)} failed.') 51 | else: 52 | bad = [x for x in data['log'] if FAIL_MSG in str(x)] 53 | if len(bad): 54 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 55 | -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/auto_run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from vlmeval.smp import * 3 | from vlmeval.config import supported_VLM 4 | 5 | def is_api(x): 6 | return getattr(supported_VLM[x].func, 'is_api', False) 7 | 8 | models = list(supported_VLM) 9 | models = [x for x in models if 'fs' not in x] 10 | models = [x for x in models if not is_api(x)] 11 | exclude_list = ['cogvlm-grounding-generalist', 'emu2'] 12 | models = [x for x in models if x not in exclude_list] 13 | 14 | def is_large(x): 15 | return '80b' in x or 'emu2' in x or '34B' in x 16 | 17 | small_models = [x for x in models if not is_large(x)] 18 | large_models = [x for x in models if is_large(x)] 19 | models = small_models + large_models 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--data', type=str, nargs='+', required=True) 23 | args = parser.parse_args() 24 | 25 | # Skip some models 26 | models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)] 27 | 28 | for m in models: 29 | unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')] 30 | if len(unknown_datasets) == 0: 31 | continue 32 | dataset_str = ' '.join(unknown_datasets) 33 | if '80b' in m: 34 | cmd = f'python run.py --data {dataset_str} --model {m}' 35 | else: 36 | cmd = f'bash run.sh --data {dataset_str} --model {m}' 37 | print(cmd) 38 | os.system(cmd) -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/cover.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 3 | cp $DIR/../config.py $DIR/../vlmeval/ 4 | cp $DIR/../misc/* $DIR/../vlmeval/vlm/misc/ -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | export GPU=$(nvidia-smi --list-gpus | wc -l) 4 | torchrun --nproc-per-node=$GPU run.py ${@:1} -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/srun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2} -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/summarize.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.utils.dataset_config import dataset_URLs 3 | 4 | def get_score(model, dataset): 5 | 6 | file_name = f'{model}/{model}_{dataset}' 7 | if listinstr([ 8 | 'CCBench', 'MMBench', 'SEEDBench_IMG', 'MMMU', 'ScienceQA', 'AI2D_TEST', 'MMStar', 'RealWorldQA'], dataset): 9 | file_name += '_acc.csv' 10 | elif listinstr(['MME', 'Hallusion', 'LLaVABench'], dataset): 11 | file_name += '_score.csv' 12 | elif listinstr(['MMVet', 'MathVista'], dataset): 13 | file_name += '_gpt-4-turbo_score.csv' 14 | elif listinstr(['COCO', 'OCRBench'], dataset): 15 | file_name += '_score.json' 16 | else: 17 | raise NotImplementedError 18 | 19 | if not osp.exists(file_name): 20 | return {} 21 | 22 | data = load(file_name) 23 | ret = {} 24 | if dataset == 'CCBench': 25 | ret[dataset] = data['Overall'][0] * 100 26 | elif dataset == 'MMBench': 27 | for n, a in zip(data['split'], data['Overall']): 28 | if n == 'dev': 29 | ret['MMBench_DEV_EN'] = a * 100 30 | elif n == 'test': 31 | ret['MMBench_TEST_EN'] = a * 100 32 | elif dataset == 'MMBench_CN': 33 | for n, a in zip(data['split'], data['Overall']): 34 | if n == 'dev': 35 | ret['MMBench_DEV_CN'] = a * 100 36 | elif n == 'test': 37 | ret['MMBench_TEST_CN'] = a * 100 38 | elif listinstr(['SEEDBench', 'ScienceQA', 'MMBench', 'AI2D_TEST', 'MMStar', 'RealWorldQA'], dataset): 39 | ret[dataset] = data['Overall'][0] * 100 40 | elif 'MME' == dataset: 41 | ret[dataset] = data['perception'][0] + data['reasoning'][0] 42 | elif 'MMVet' == dataset: 43 | data = data[data['Category'] == 'Overall'] 44 | ret[dataset] = float(data.iloc[0]['acc']) 45 | elif 'HallusionBench' == dataset: 46 | data = data[data['split'] == 'Overall'] 47 | for met in ['aAcc', 'qAcc', 'fAcc']: 48 | ret[dataset + f' ({met})'] = float(data.iloc[0][met]) 49 | elif 'MMMU' in dataset: 50 | data = data[data['split'] == 'validation'] 51 | ret['MMMU (val)'] = float(data.iloc[0]['Overall']) * 100 52 | elif 'MathVista' in dataset: 53 | data = data[data['Task&Skill'] == 'Overall'] 54 | ret[dataset] = float(data.iloc[0]['acc']) 55 | elif 'LLaVABench' in dataset: 56 | data = data[data['split'] == 'overall'].iloc[0] 57 | ret[dataset] = float(data['Relative Score (main)']) 58 | elif 'OCRBench' in dataset: 59 | ret[dataset] = data['Final Score'] 60 | 61 | return ret 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser() 65 | 
parser.add_argument('--data', type=str, nargs='+', default=[]) 66 | parser.add_argument("--model", type=str, nargs='+', required=True) 67 | args = parser.parse_args() 68 | return args 69 | 70 | def gen_table(models, datasets): 71 | res = defaultdict(dict) 72 | for m in models: 73 | for d in datasets: 74 | try: 75 | res[m].update(get_score(m, d)) 76 | except: 77 | pass 78 | keys = [] 79 | for m in models: 80 | for d in res[m]: 81 | keys.append(d) 82 | keys = list(set(keys)) 83 | keys.sort() 84 | final = defaultdict(list) 85 | for m in models: 86 | final['Model'].append(m) 87 | for k in keys: 88 | if k in res[m]: 89 | final[k].append(res[m][k]) 90 | else: 91 | final[k].append(None) 92 | final = pd.DataFrame(final) 93 | dump(final, 'summ.csv') 94 | if len(final) >= len(final.iloc[0].keys()): 95 | print(tabulate(final)) 96 | else: 97 | print(tabulate(final.T)) 98 | 99 | if __name__ == '__main__': 100 | args = parse_args() 101 | if args.data == []: 102 | args.data = list(dataset_URLs) 103 | gen_table(args.model, args.data) -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | except ImportError: 4 | pass 5 | 6 | from .smp import * 7 | from .api import * 8 | from .evaluate import * 9 | from .utils import * 10 | from .vlm import * 11 | from .config import * 12 | from .tools import cli 13 | 14 | load_env() 15 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import OpenAIWrapper, GPT4V 2 | from .gpt_int import OpenAIWrapperInternal, GPT4V_Internal 3 | from .hf_chat_model import HFChatModel 4 | from .gemini import GeminiWrapper, GeminiProVision 5 | from .qwen_vl_api import QwenVLWrapper, QwenVLAPI 6 | from .qwen_api import QwenAPI 7 | from .stepai import Step1V_INT 8 | from .claude import Claude_Wrapper, Claude3V 9 | from .reka import Reka 10 | from .glm_vision import GLMVisionAPI 11 | from .cloudwalk import CWWrapper 12 | 13 | __all__ = [ 14 | 'OpenAIWrapper', 'HFChatModel', 'OpenAIWrapperInternal', 'GeminiWrapper', 15 | 'GPT4V', 'GPT4V_Internal', 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI', 16 | 'QwenAPI', 'Claude3V', 'Claude_Wrapper', 'Reka', 'Step1V_INT', 'GLMVisionAPI', 17 | 'CWWrapper' 18 | ] 19 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/claude.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import base64 5 | import mimetypes 6 | 7 | url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat' 8 | headers = { 9 | 'alles-apin-token': '', 10 | 'Content-Type': 'application/json' 11 | } 12 | 13 | 14 | class Claude_Wrapper(BaseAPI): 15 | 16 | is_api: bool = True 17 | 18 | def __init__(self, 19 | model: str = 'claude-3-opus-20240229', 20 | key: str = None, 21 | retry: int = 10, 22 | wait: int = 3, 23 | system_prompt: str = None, 24 | verbose: bool = True, 25 | temperature: float = 0, 26 | max_tokens: int = 1024, 27 | **kwargs): 28 | 29 | self.model = model 30 | self.headers = headers 31 | self.temperature = temperature 32 | self.max_tokens = max_tokens 33 | if key is not None: 34 | self.key = key 35 | else: 36 | self.key = 
os.environ.get('ALLES', '') 37 | self.headers['alles-apin-token'] = self.key 38 | 39 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 40 | 41 | def build_msgs(self, msgs_raw): 42 | 43 | messages = [] 44 | message = {'role': 'user', 'content': []} 45 | for msg in msgs_raw: 46 | if msg['type'] == 'image': 47 | pth = msg['value'] 48 | suffix = osp.splitext(pth)[-1].lower() 49 | media_type = mimetypes.types_map.get(suffix, None) 50 | assert media_type is not None 51 | 52 | item = { 53 | 'type': 'image', 54 | 'source': {'type': 'base64', 'media_type': media_type, 'data': encode_image_file_to_base64(pth)} 55 | } 56 | 57 | elif msg['type'] == 'text': 58 | item = {'type': 'text', 'text': msg['value']} 59 | else: 60 | raise NotImplementedError(f'Unsupported message type: {msg["type"]}') 61 | 62 | message['content'].append(item) 63 | messages.append(message) 64 | return messages 65 | 66 | def generate_inner(self, inputs, **kwargs) -> str: 67 | 68 | payload = json.dumps({ 69 | 'model': self.model, 70 | 'max_tokens': self.max_tokens, 71 | 'messages': self.build_msgs(msgs_raw=inputs), 72 | **kwargs 73 | }) 74 | response = requests.request('POST', url, headers=headers, data=payload) 75 | 76 | ret_code = response.status_code 77 | retry = self.retry 78 | while ret_code == 429 and retry > 0: 79 | sleep(15) 80 | response = requests.request('POST', url, headers=headers, data=payload) 81 | ret_code = response.status_code 82 | retry -= 1 83 | 84 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 85 | answer = self.fail_msg 86 | 87 | try: 88 | resp_struct = json.loads(response.text) 89 | answer = resp_struct['data']['content'][0]['text'].strip() 90 | except: 91 | pass 92 | 93 | return ret_code, answer, response 94 | 95 | 96 | class Claude3V(Claude_Wrapper): 97 | 98 | def generate(self, message, dataset=None): 99 | return super(Claude_Wrapper, self).generate(message) 100 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/cloudwalk.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | import os 3 | from .base import BaseAPI 4 | 5 | 6 | class CWWrapper(BaseAPI): 7 | 8 | is_api: bool = True 9 | 10 | def __init__(self, 11 | model: str = 'cw-congrong-v1.5', 12 | retry: int = 10, 13 | wait: int = 5, 14 | key: str = None, 15 | verbose: bool = True, 16 | system_prompt: str = None, 17 | temperature: float = 0, 18 | timeout: int = 600, 19 | api_base: str = 'http://cwapi-vlm01.cw_rb.azurebot.tk/v1/chat/completions', 20 | max_tokens: int = 1024, 21 | img_size: int = 512, 22 | img_detail: str = 'low', 23 | **kwargs): 24 | 25 | self.model = model 26 | self.cur_idx = 0 27 | self.fail_msg = 'Failed to obtain answer via API. ' 28 | self.max_tokens = max_tokens 29 | self.temperature = temperature 30 | 31 | base = os.environ.get('CW_API_BASE', None) 32 | self.api_base = base if base is not None else api_base 33 | 34 | env_key = os.environ.get('CW_API_KEY', None) 35 | self.key = env_key if env_key is not None else key 36 | assert self.key is not None, 'API key not provided. Please set CW_API_KEY environment variable or \ 37 | pass it to the constructor.' 
38 | 39 | assert img_size > 0 or img_size == -1 40 | self.img_size = -1 # allways send full size image 41 | assert img_detail in ['high', 'low'] 42 | self.img_detail = img_detail 43 | 44 | self.vision = True 45 | self.timeout = timeout 46 | 47 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | # inputs can be a lvl-2 nested list: [content1, content2, content3, ...] 50 | # content can be a string or a list of image & text 51 | def prepare_inputs(self, inputs): 52 | input_msgs = [] 53 | if self.system_prompt is not None: 54 | input_msgs.append(dict(role='system', content=self.system_prompt)) 55 | has_images = np.sum([x['type'] == 'image' for x in inputs]) 56 | if has_images: 57 | content_list = [] 58 | for msg in inputs: 59 | if msg['type'] == 'text': 60 | content_list.append(dict(type='text', text=msg['value'])) 61 | elif msg['type'] == 'image': 62 | from PIL import Image 63 | img = Image.open(msg['value']) 64 | b64 = encode_image_to_base64(img, target_size=self.img_size) 65 | img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail) 66 | content_list.append(dict(type='image_url', image_url=img_struct)) 67 | input_msgs.append(dict(role='user', content=content_list)) 68 | else: 69 | assert all([x['type'] == 'text' for x in inputs]) 70 | text = '\n'.join([x['value'] for x in inputs]) 71 | input_msgs.append(dict(role='user', content=text)) 72 | return input_msgs 73 | 74 | def generate_inner(self, inputs, **kwargs) -> str: 75 | input_msgs = self.prepare_inputs(inputs) 76 | temperature = kwargs.pop('temperature', self.temperature) 77 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 78 | 79 | if 0 < max_tokens <= 100: 80 | self.logger.warning( 81 | 'Less than 100 tokens left, ' 82 | 'may exceed the context window with some additional meta symbols. ' 83 | ) 84 | if max_tokens <= 0: 85 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' 86 | 87 | headers = {'Content-Type': 'application/json', 'Authorization': f'{self.key}'} 88 | payload = dict( 89 | model=self.model, 90 | messages=input_msgs, 91 | max_tokens=max_tokens, 92 | n=1, 93 | temperature=temperature, 94 | **kwargs) 95 | response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 96 | ret_code = response.status_code 97 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 98 | answer = self.fail_msg 99 | try: 100 | resp_struct = json.loads(response.text) 101 | answer = resp_struct['choices'][0]['message']['content'].strip() 102 | except: 103 | pass 104 | return ret_code, answer, response 105 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/glm_vision.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from vlmeval.utils.dataset import DATASET_TYPE 4 | from vlmeval.smp.vlm import encode_image_file_to_base64 5 | 6 | 7 | class GLMVisionWrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | 11 | def __init__(self, 12 | model: str, 13 | retry: int = 5, 14 | wait: int = 5, 15 | key: str = None, 16 | verbose: bool = True, 17 | system_prompt: str = None, 18 | max_tokens: int = 1024, 19 | proxy: str = None, 20 | **kwargs): 21 | 22 | self.model = model 23 | self.fail_msg = 'Failed to obtain answer via API. 
' 24 | self.default_params = { 25 | 'top_p': 0.6, 26 | 'top_k': 2, 27 | 'temperature': 0.8, 28 | 'repetition_penalty': 1.1, 29 | 'best_of': 1, 30 | 'do_sample': True, 31 | 'stream': False, 32 | 'max_tokens': max_tokens 33 | } 34 | if key is None: 35 | key = os.environ.get('GLMV_API_KEY', None) 36 | assert key is not None, ( 37 | 'Please set the API Key (obtain it here: ' 38 | 'https://open.bigmodel.cn/dev/howuse/introduction)' 39 | ) 40 | self.key = key 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | def image_to_base64(self, image_path): 44 | import base64 45 | with open(image_path, 'rb') as image_file: 46 | encoded_string = base64.b64encode(image_file.read()) 47 | return encoded_string.decode('utf-8') 48 | 49 | def build_msgs(self, msgs_raw, system_prompt=None, dataset=None): 50 | msgs = cp.deepcopy(msgs_raw) 51 | content = [] 52 | text = '' 53 | for i, msg in enumerate(msgs): 54 | if msg['type'] == 'text': 55 | text += msg['value'] 56 | elif msg['type'] == 'image': 57 | content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value'])))) 58 | if dataset is not None and DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: 59 | text += '\nShort Answer.' 60 | content.append(dict(type='text', text=text)) 61 | ret = [dict(role='user', content=content)] 62 | return ret 63 | 64 | def generate_inner(self, inputs, **kwargs) -> str: 65 | assert isinstance(inputs, str) or isinstance(inputs, list) 66 | inputs = [inputs] if isinstance(inputs, str) else inputs 67 | 68 | messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None)) 69 | 70 | url = 'https://api.chatglm.cn/v1/chat/completions' 71 | headers = { 72 | 'Content-Type': 'application/json', 73 | 'Request-Id': 'remote-test', 74 | 'Authorization': f'Bearer {self.key}' 75 | } 76 | payload = { 77 | 'model': self.model, 78 | 'messages': messages, 79 | **self.default_params 80 | } 81 | response = requests.post(url, headers=headers, data=json.dumps(payload), verify=False) 82 | output = [] 83 | try: 84 | assert response.status_code == 200 85 | for line in response.iter_lines(): 86 | data = json.loads(line.decode('utf-8').lstrip('data: ')) 87 | output.append(data['choices'][0]['message']['content']) 88 | answer = ''.join(output).replace('', '') 89 | if self.verbose: 90 | self.logger.info(f'inputs: {inputs}\nanswer: {answer}') 91 | return 0, answer, 'Succeeded! 
' 92 | except Exception as err: 93 | if self.verbose: 94 | self.logger.error(err) 95 | self.logger.error(f'The input messages are {inputs}.') 96 | return -1, self.fail_msg, '' 97 | 98 | 99 | class GLMVisionAPI(GLMVisionWrapper): 100 | 101 | def generate(self, message, dataset=None): 102 | return super(GLMVisionAPI, self).generate(message, dataset=dataset) 103 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/gpt_int.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | import requests 4 | from ..smp import * 5 | from .gpt import GPT_context_window, OpenAIWrapper 6 | 7 | url = 'http://ecs.sv.us.alles-apin.openxlab.org.cn/v1/openai/v2/text/chat' 8 | headers = { 9 | 'Content-Type': 'application/json' 10 | } 11 | 12 | 13 | class OpenAIWrapperInternal(OpenAIWrapper): 14 | 15 | is_api: bool = True 16 | 17 | def __init__(self, 18 | model: str = 'gpt-3.5-turbo-0125', 19 | retry: int = 5, 20 | wait: int = 3, 21 | verbose: bool = True, 22 | system_prompt: str = None, 23 | temperature: float = 0, 24 | timeout: int = 60, 25 | max_tokens: int = 1024, 26 | img_size: int = 512, 27 | img_detail: str = 'low', 28 | **kwargs): 29 | 30 | self.model = model 31 | if 'KEYS' in os.environ and osp.exists(os.environ['KEYS']): 32 | keys = load(os.environ['KEYS']) 33 | headers['alles-apin-token'] = keys.get('alles-apin-token', '') 34 | elif 'ALLES' in os.environ: 35 | headers['alles-apin-token'] = os.environ['ALLES'] 36 | self.headers = headers 37 | self.temperature = temperature 38 | self.timeout = timeout 39 | self.max_tokens = max_tokens 40 | 41 | assert img_size > 0 or img_size == -1 42 | self.img_size = img_size 43 | assert img_detail in ['high', 'low'] 44 | self.img_detail = img_detail 45 | 46 | super(OpenAIWrapper, self).__init__( 47 | wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | def generate_inner(self, inputs, **kwargs) -> str: 50 | input_msgs = self.prepare_inputs(inputs) 51 | 52 | temperature = kwargs.pop('temperature', self.temperature) 53 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 54 | 55 | # Held out 100 tokens as buffer 56 | context_window = GPT_context_window(self.model) 57 | max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) 58 | if 0 < max_tokens <= 100: 59 | print('Less than 100 tokens left, may exceed the context window with some additional meta symbols. ') 60 | if max_tokens <= 0: 61 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. 
' 62 | 63 | payload = dict( 64 | model=self.model, 65 | messages=input_msgs, 66 | max_tokens=max_tokens, 67 | n=1, 68 | stop=None, 69 | timeout=self.timeout, 70 | temperature=temperature, 71 | **kwargs) 72 | 73 | response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 74 | ret_code = response.status_code 75 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 76 | 77 | answer = self.fail_msg 78 | try: 79 | resp_struct = json.loads(response.text) 80 | assert resp_struct['msg'] == 'ok' and resp_struct['msgCode'] == '10000', resp_struct 81 | answer = resp_struct['data']['choices'][0]['message']['content'].strip() 82 | except: 83 | pass 84 | return ret_code, answer, response 85 | 86 | 87 | class GPT4V_Internal(OpenAIWrapperInternal): 88 | 89 | def generate(self, message, dataset=None): 90 | return super(GPT4V_Internal, self).generate(message) 91 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/qwen_api.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | import os 3 | from vlmeval.api.base import BaseAPI 4 | from vlmeval.smp import * 5 | 6 | 7 | # Note: This is a pure language model API. 8 | class QwenAPI(BaseAPI): 9 | 10 | is_api: bool = True 11 | 12 | def __init__(self, 13 | model: str = 'qwen-max-1201', 14 | retry: int = 5, 15 | wait: int = 5, 16 | verbose: bool = True, 17 | seed: int = 2680, 18 | temperature: float = 0.0, 19 | system_prompt: str = None, 20 | key: str = None, 21 | max_tokens: int = 1024, 22 | proxy: str = None, 23 | **kwargs): 24 | 25 | assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext'] 26 | self.model = model 27 | import dashscope 28 | self.fail_msg = 'Failed to obtain answer via API. ' 29 | self.max_tokens = max_tokens 30 | self.temperature = temperature 31 | self.seed = seed 32 | if key is None: 33 | key = os.environ.get('DASHSCOPE_API_KEY', None) 34 | assert key is not None, ( 35 | 'Please set the API Key (obtain it here: ' 36 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 37 | ) 38 | dashscope.api_key = key 39 | if proxy is not None: 40 | proxy_set(proxy) 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | @staticmethod 44 | def build_msgs(msgs_raw, system_prompt=None): 45 | msgs = cp.deepcopy(msgs_raw) 46 | ret = [] 47 | if system_prompt is not None: 48 | ret.append(dict(role='system', content=system_prompt)) 49 | for i, msg in enumerate(msgs): 50 | role = 'user' if i % 2 == 0 else 'assistant' 51 | ret.append(dict(role=role, content=msg)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | inputs = [inputs] if isinstance(inputs, str) else inputs 58 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 59 | 60 | import dashscope 61 | response = dashscope.Generation.call( 62 | model=self.model, 63 | messages=messages, 64 | seed=self.seed, 65 | temperature=self.temperature, 66 | max_tokens=self.max_tokens, 67 | result_format='message', # set the result to be "message" format. 68 | ) 69 | if response.status_code != HTTPStatus.OK: 70 | return -1, 'Error: Bad Response Statuse Code. ', f'The response status code is {response.status_code}. 
' 71 | 72 | try: 73 | return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! ' 74 | except Exception as err: 75 | return -1, f'Error: Failed to parse the response. {err}', response 76 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/qwen_vl_api.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | 5 | class QwenVLWrapper(BaseAPI): 6 | 7 | is_api: bool = True 8 | 9 | def __init__(self, 10 | model: str = 'qwen-vl-plus', 11 | retry: int = 5, 12 | wait: int = 5, 13 | key: str = None, 14 | verbose: bool = True, 15 | temperature: float = 0.0, 16 | system_prompt: str = None, 17 | max_tokens: int = 1024, 18 | proxy: str = None, 19 | **kwargs): 20 | 21 | assert model in ['qwen-vl-plus', 'qwen-vl-max'] 22 | self.model = model 23 | import dashscope 24 | self.fail_msg = 'Failed to obtain answer via API. ' 25 | self.max_tokens = max_tokens 26 | self.temperature = temperature 27 | if key is None: 28 | key = os.environ.get('DASHSCOPE_API_KEY', None) 29 | assert key is not None, ( 30 | 'Please set the API Key (obtain it here: ' 31 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 32 | ) 33 | dashscope.api_key = key 34 | if proxy is not None: 35 | proxy_set(proxy) 36 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 37 | 38 | @staticmethod 39 | def build_msgs(msgs_raw, system_prompt=None): 40 | msgs = cp.deepcopy(msgs_raw) 41 | ret = [] 42 | if system_prompt is not None: 43 | content = list(dict(text=system_prompt)) 44 | ret.append(dict(role='system', content=content)) 45 | content = [] 46 | for msg in msgs: 47 | if msg['type'] == 'text': 48 | content.append(dict(text=msg['value'])) 49 | elif msg['type'] == 'image': 50 | content.append(dict(image='file://' + msg['value'])) 51 | ret.append(dict(role='user', content=content)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | pure_text = np.all([x['type'] == 'text' for x in inputs]) 58 | assert not pure_text 59 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 60 | gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature) 61 | gen_config.update(kwargs) 62 | try: 63 | response = MultiModalConversation.call(model=self.model, messages=messages) 64 | if self.verbose: 65 | print(response) 66 | answer = response.output.choices[0]['message']['content'][0]['text'] 67 | return 0, answer, 'Succeeded! 
' 68 | except Exception as err: 69 | if self.verbose: 70 | self.logger.error(err) 71 | self.logger.error(f'The input messages are {inputs}.') 72 | 73 | return -1, '', '' 74 | 75 | 76 | class QwenVLAPI(QwenVLWrapper): 77 | 78 | def generate(self, message, dataset=None): 79 | return super(QwenVLAPI, self).generate(message) 80 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/reka.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import mimetypes 5 | 6 | 7 | class Reka_Wrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | INTERLEAVE: bool = False 11 | 12 | def __init__(self, 13 | model: str = 'reka-flash-20240226', 14 | key: str = None, 15 | retry: int = 10, 16 | wait: int = 3, 17 | system_prompt: str = None, 18 | verbose: bool = True, 19 | temperature: float = 0, 20 | max_tokens: int = 1024, 21 | **kwargs): 22 | 23 | try: 24 | import reka 25 | except ImportError: 26 | raise ImportError('Please install reka by running "pip install reka-api"') 27 | 28 | self.model = model 29 | default_kwargs = dict(temperature=temperature, request_output_len=max_tokens) 30 | default_kwargs.update(kwargs) 31 | self.kwargs = default_kwargs 32 | if key is not None: 33 | self.key = key 34 | else: 35 | self.key = os.environ.get('REKA_API_KEY', '') 36 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 37 | 38 | def generate_inner(self, inputs, **kwargs) -> str: 39 | import reka 40 | reka.API_KEY = self.key 41 | prompt, image_path = self.message_to_promptimg(inputs) 42 | image_b64 = encode_image_file_to_base64(image_path) 43 | 44 | response = reka.chat( 45 | model_name=self.model, 46 | human=prompt, 47 | media_url=f'data:image/jpeg;base64,{image_b64}', 48 | **self.kwargs) 49 | 50 | try: 51 | return 0, response['text'], response 52 | except: 53 | return -1, self.fail_msg, response 54 | 55 | 56 | class Reka(Reka_Wrapper): 57 | 58 | def generate(self, message, dataset=None): 59 | return super(Reka_Wrapper, self).generate(message) 60 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/stepai.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | url = 'https://api.stepfun.com/v1/chat/completions' 5 | headers = { 6 | 'Content-Type': 'application/json', 7 | 'Authorization': 'Bearer {}', 8 | } 9 | 10 | 11 | class StepAPI_INT(BaseAPI): 12 | 13 | is_api: bool = True 14 | 15 | def __init__(self, 16 | model: str = 'step-1v-8k', 17 | retry: int = 10, 18 | wait: int = 3, 19 | key: str = None, 20 | temperature: float = 0, 21 | max_tokens: int = 300, 22 | verbose: bool = True, 23 | system_prompt: str = None, 24 | **kwargs): 25 | self.model = model 26 | self.fail_msg = 'Fail to obtain answer via API.' 
27 | self.headers = headers 28 | self.temperature = temperature 29 | self.max_tokens = max_tokens 30 | self.system_prompt = system_prompt 31 | if key is not None: 32 | self.key = key 33 | else: 34 | self.key = os.environ.get('STEPAI_API_KEY', '') 35 | headers['Authorization'] = headers['Authorization'].format(self.key) 36 | 37 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 38 | 39 | @staticmethod 40 | def build_msgs(msgs_raw): 41 | messages = [] 42 | message = {'role': 'user', 'content': []} 43 | 44 | for msg in msgs_raw: 45 | if msg['type'] == 'image': 46 | image_b64 = encode_image_file_to_base64(msg['value']) 47 | message['content'].append({ 48 | 'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)}, 49 | 'type': 'image_url' 50 | }) 51 | elif msg['type'] == 'text': 52 | message['content'].append({ 53 | 'text': msg['value'], 54 | 'type': 'text' 55 | }) 56 | 57 | messages.append(message) 58 | return messages 59 | 60 | def generate_inner(self, inputs, **kwargs) -> str: 61 | print(inputs, '\n') 62 | payload = dict( 63 | model=self.model, 64 | max_tokens=self.max_tokens, 65 | temperature=self.temperature, 66 | messages=self.build_msgs(msgs_raw=inputs), 67 | **kwargs) 68 | response = requests.post(url, headers=headers, data=json.dumps(payload)) 69 | ret_code = response.status_code 70 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 71 | 72 | answer = self.fail_msg 73 | try: 74 | resp_struct = json.loads(response.text) 75 | answer = resp_struct['choices'][0]['message']['content'].strip() 76 | except: 77 | pass 78 | return ret_code, answer, response 79 | 80 | 81 | class Step1V_INT(StepAPI_INT): 82 | 83 | def generate(self, message, dataset=None): 84 | return super(StepAPI_INT, self).generate(message) 85 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/evaluate/OCRBench.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | 3 | 4 | def OCRBench_eval(eval_file): 5 | OCRBench_score = { 6 | 'Regular Text Recognition': 0, 7 | 'Irregular Text Recognition': 0, 8 | 'Artistic Text Recognition': 0, 9 | 'Handwriting Recognition': 0, 10 | 'Digit String Recognition': 0, 11 | 'Non-Semantic Text Recognition': 0, 12 | 'Scene Text-centric VQA': 0, 13 | 'Doc-oriented VQA': 0, 14 | 'Key Information Extraction': 0, 15 | 'Handwritten Mathematical Expression Recognition': 0 16 | } 17 | 18 | logger = get_logger('Evaluation') 19 | 20 | data = load(eval_file) 21 | lt = len(data) 22 | lines = [data.iloc[i] for i in range(lt)] 23 | for i in tqdm(range(len(lines))): 24 | line = lines[i] 25 | predict = str(line['prediction']) 26 | answers = eval(line['answer']) 27 | category = line['category'] 28 | if category == 'Handwritten Mathematical Expression Recognition': 29 | for j in range(len(answers)): 30 | answer = answers[j].strip().replace('\n', ' ').replace(' ', '') 31 | predict = predict.strip().replace('\n', ' ').replace(' ', '') 32 | if answer in predict: 33 | OCRBench_score[category] += 1 34 | break 35 | else: 36 | for j in range(len(answers)): 37 | answer = answers[j].lower().strip().replace('\n', ' ') 38 | predict = predict.lower().strip().replace('\n', ' ') 39 | if answer in predict: 40 | OCRBench_score[category] += 1 41 | break 42 | 43 | final_score_dict = {} 44 | final_score_dict['Text Recognition'] = ( 45 | OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] 46 | + 
OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition'] 47 | + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'] 48 | ) 49 | final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] 50 | final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] 51 | final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] 52 | final_score_dict['Handwritten Mathematical Expression Recognition'] = \ 53 | OCRBench_score['Handwritten Mathematical Expression Recognition'] 54 | final_score_dict['Final Score'] = ( 55 | final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] 56 | + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] 57 | + final_score_dict['Handwritten Mathematical Expression Recognition'] 58 | ) 59 | final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 60 | score_pth = eval_file.replace('.xlsx', '_score.json') 61 | dump(final_score_dict, score_pth) 62 | logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 63 | logger.info('Score: ') 64 | for key, value in final_score_dict.items(): 65 | logger.info('{}:{}'.format(key, value)) 66 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .yes_or_no import default_rating, MME_rating, YOrN_eval 2 | from .mmvet_eval import MMVet_eval 3 | from .multiple_choice import multiple_choice_eval 4 | from .coco_eval import COCO_eval 5 | from .vqa_eval import VQAEval 6 | from .mathvista_eval import MathVista_eval 7 | from .llavabench import LLaVABench_eval 8 | from .misc import build_judge 9 | from .OCRBench import OCRBench_eval 10 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/evaluate/coco_eval.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from pycocoevalcap.bleu.bleu import Bleu 3 | from pycocoevalcap.rouge.rouge import Rouge 4 | from pycocoevalcap.cider.cider import Cider 5 | 6 | 7 | class COCO_Caption_Scorer(): 8 | def __init__(self, ref, gt): 9 | self.ref = ref 10 | self.gt = gt 11 | print('setting up scorers...') 12 | self.scorers = [ 13 | (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), 14 | # (Meteor(), "METEOR"), # need java version 11.0.16+ 15 | (Rouge(), 'ROUGE_L'), 16 | (Cider(), 'CIDEr'), 17 | # (Spice(), "SPICE"), # need java version 11.0.16+ 18 | ] 19 | 20 | def compute_scores(self): 21 | total_scores = {} 22 | for scorer, method in self.scorers: 23 | print('computing %s score...' 
% (scorer.method())) 24 | score, scores = scorer.compute_score(self.gt, self.ref) 25 | if type(method) == list: 26 | for sc, scs, m in zip(score, scores, method): 27 | print('%s: %0.3f' % (m, sc * 100)) 28 | total_scores['Bleu'] = [x * 100 for x in score] 29 | else: 30 | print('%s: %0.3f' % (method, score * 100)) 31 | total_scores[method] = score * 100 32 | 33 | print('*****DONE*****') 34 | for key, value in total_scores.items(): 35 | print('{}:{}'.format(key, value)) 36 | return total_scores 37 | 38 | 39 | def COCO_eval(eval_file, nproc=4, verbose=False): 40 | logger = get_logger('Evaluation') 41 | 42 | data = load(eval_file) 43 | 44 | lt = len(data) 45 | lines = [data.iloc[i] for i in range(lt)] 46 | ref = {} 47 | gt = {} 48 | for i, line in enumerate(lines): 49 | ref[str(i)] = [str(line['prediction'])] 50 | gt[str(i)] = eval(line['answer']) 51 | 52 | scorer = COCO_Caption_Scorer(ref, gt) 53 | coco_caption_score_dict = scorer.compute_scores() 54 | 55 | score_pth = eval_file.replace('.xlsx', '_score.json') 56 | dump(coco_caption_score_dict, score_pth) 57 | logger.info(f'COCO_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 58 | logger.info('Score: ') 59 | for key, value in coco_caption_score_dict.items(): 60 | logger.info('{}:{}'.format(key, value)) 61 | 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser(description='Inference LLM Answers. ') 65 | parser.add_argument('--data', type=str, help='The question set for inference, in excel / tsv / json format. ') 66 | parser.add_argument('--nproc', type=int, default=4) 67 | parser.add_argument('--verbose', action='store_true') 68 | args = parser.parse_args() 69 | return args 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | COCO_eval(eval_file=args.data, nproc=args.nproc, verbose=args.verbose) 75 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/evaluate/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | from vlmeval.api import OpenAIWrapper, OpenAIWrapperInternal 3 | from vlmeval.smp import load_env 4 | 5 | INTERNAL = os.environ.get('INTERNAL', 0) 6 | 7 | 8 | def build_judge(**kwargs): 9 | model = kwargs.pop('model', None) 10 | load_env() 11 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None) 12 | if LOCAL_LLM is None: 13 | model_map = { 14 | 'gpt-4-turbo': 'gpt-4-1106-preview', 15 | 'gpt-4-0613': 'gpt-4-0613', 16 | 'gpt-4-0125': 'gpt-4-0125-preview', 17 | 'gpt-4-0409': 'gpt-4-turbo-2024-04-09', 18 | 'chatgpt-1106': 'gpt-3.5-turbo-1106', 19 | 'chatgpt-0125': 'gpt-3.5-turbo-0125', 20 | } 21 | model_version = model_map[model] 22 | else: 23 | model_version = LOCAL_LLM 24 | if INTERNAL: 25 | model = OpenAIWrapperInternal(model_version, **kwargs) 26 | else: 27 | model = OpenAIWrapper(model_version, **kwargs) 28 | return model 29 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .vlm import * 3 | from .misc import * 4 | from .log import * 5 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_initialized = {} 4 | 5 | 6 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 7 | logger = 
logging.getLogger(name) 8 | if name in logger_initialized: 9 | return logger 10 | 11 | for logger_name in logger_initialized: 12 | if name.startswith(logger_name): 13 | return logger 14 | 15 | stream_handler = logging.StreamHandler() 16 | handlers = [stream_handler] 17 | 18 | try: 19 | import torch.distributed as dist 20 | if dist.is_available() and dist.is_initialized(): 21 | rank = dist.get_rank() 22 | else: 23 | rank = 0 24 | except ImportError: 25 | rank = 0 26 | 27 | if rank == 0 and log_file is not None: 28 | file_handler = logging.FileHandler(log_file, file_mode) 29 | handlers.append(file_handler) 30 | 31 | formatter = logging.Formatter( 32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 33 | for handler in handlers: 34 | handler.setFormatter(formatter) 35 | handler.setLevel(log_level) 36 | logger.addHandler(handler) 37 | 38 | if rank == 0: 39 | logger.setLevel(log_level) 40 | else: 41 | logger.setLevel(logging.ERROR) 42 | 43 | logger_initialized[name] = True 44 | return logger 45 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/smp/vlm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import pandas as pd 4 | import numpy as np 5 | import string 6 | from uuid import uuid4 7 | import os.path as osp 8 | import base64 9 | from PIL import Image 10 | from .file import load, dump 11 | Image.MAX_IMAGE_PIXELS = 1e9 12 | 13 | 14 | def mmqa_display(question, target_size=512): 15 | question = {k.lower(): v for k, v in question.items()} 16 | keys = list(question.keys()) 17 | keys = [k for k in keys if k not in ['index', 'image']] 18 | 19 | images = question['image'] 20 | if isinstance(images, str): 21 | images = [images] 22 | 23 | idx = question.pop('index', 'XXX') 24 | print(f'INDEX: {idx}') 25 | 26 | for im in images: 27 | image = decode_base64_to_image(im, target_size=target_size) 28 | display(image) # noqa: F821 29 | 30 | for k in keys: 31 | try: 32 | if not pd.isna(question[k]): 33 | print(f'{k.upper()}. {question[k]}') 34 | except ValueError: 35 | if False in pd.isna(question[k]): 36 | print(f'{k.upper()}. 
{question[k]}') 37 | 38 | 39 | def encode_image_to_base64(img, target_size=-1): 40 | # if target_size == -1, will not do resizing 41 | # else, will set the max_size ot (target_size, target_size) 42 | if img.mode in ('RGBA', 'P'): 43 | img = img.convert('RGB') 44 | tmp = osp.join('/tmp', str(uuid4()) + '.jpg') 45 | if target_size > 0: 46 | img.thumbnail((target_size, target_size)) 47 | img.save(tmp) 48 | with open(tmp, 'rb') as image_file: 49 | image_data = image_file.read() 50 | ret = base64.b64encode(image_data).decode('utf-8') 51 | os.remove(tmp) 52 | return ret 53 | 54 | 55 | def encode_image_file_to_base64(image_path, target_size=-1): 56 | image = Image.open(image_path) 57 | return encode_image_to_base64(image, target_size=target_size) 58 | 59 | 60 | def decode_base64_to_image(base64_string, target_size=-1): 61 | image_data = base64.b64decode(base64_string) 62 | image = Image.open(io.BytesIO(image_data)) 63 | if image.mode in ('RGBA', 'P'): 64 | image = image.convert('RGB') 65 | if target_size > 0: 66 | image.thumbnail((target_size, target_size)) 67 | return image 68 | 69 | 70 | def decode_base64_to_image_file(base64_string, image_path, target_size=-1): 71 | image = decode_base64_to_image(base64_string, target_size=target_size) 72 | image.save(image_path) 73 | 74 | 75 | def build_option_str(option_dict): 76 | s = 'There are several options: \n' 77 | for c, content in option_dict.items(): 78 | if not pd.isna(content): 79 | s += f'{c}. {content}\n' 80 | return s 81 | 82 | 83 | def isimg(s): 84 | return osp.exists(s) or s.startswith('http') 85 | 86 | 87 | def read_ok(img_path): 88 | if not osp.exists(img_path): 89 | return False 90 | try: 91 | im = Image.open(img_path) 92 | assert im.size[0] > 0 and im.size[1] > 0 93 | return True 94 | except: 95 | return False 96 | 97 | 98 | def gpt_key_set(): 99 | openai_key = os.environ.get('OPENAI_API_KEY', None) 100 | return isinstance(openai_key, str) and openai_key.startswith('sk-') 101 | 102 | 103 | def apiok(wrapper): 104 | s = wrapper.generate('Hello!') 105 | return wrapper.fail_msg not in s 106 | 107 | 108 | def circular_pred(df, extract_func=None): 109 | if extract_func is None: 110 | extract_func = lambda x: x # noqa: E731 111 | df = df.sort_values('index') 112 | from vlmeval.utils import can_infer_option 113 | shift = int(1e6) 114 | 115 | choices = [extract_func(x) for x in df['prediction']] 116 | pred_map = {i: c for i, c in zip(df['index'], choices)} 117 | flag_map = {i: True for i in pred_map if i < 1e6} 118 | valid_map = {i: True for i in pred_map if i < 1e6} 119 | for i in df['index']: 120 | if i >= shift and pred_map[i] and pred_map[i - shift]: 121 | if ( 122 | pred_map[i] not in list(string.ascii_uppercase) or # noqa: W504 123 | pred_map[i - shift] not in list(string.ascii_uppercase) 124 | ): 125 | 126 | valid_map[i % shift] = False 127 | continue 128 | if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1: 129 | continue 130 | else: 131 | flag_map[i % shift] = False 132 | flag_map = {k: v for k, v in flag_map.items() if valid_map[k]} 133 | flags = list(flag_map.values()) 134 | return np.mean(flags) 135 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .matching_util import can_infer, can_infer_option, can_infer_text 2 | from .mp_util import track_progress_rich 3 | from .custom_prompt import CustomPrompt 4 | from .dataset_config import dataset_URLs, img_root_map, 
DATASET_TYPE, abbr2full 5 | from .dataset import TSVDataset, split_MMMU 6 | from .result_transfer import MMMU_result_transfer, MMTBench_result_transfer 7 | 8 | 9 | __all__ = [ 10 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 11 | 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt', 12 | 'split_MMMU', 'abbr2full', 'MMMU_result_transfer', 'MMTBench_result_transfer' 13 | ] 14 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/utils/custom_prompt.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | from .dataset_config import img_root_map 3 | from abc import abstractmethod 4 | 5 | 6 | class CustomPrompt: 7 | 8 | @abstractmethod 9 | def use_custom_prompt(self, dataset): 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def build_prompt(self, line, dataset): 14 | raise NotImplementedError 15 | 16 | def dump_image(self, line, dataset): 17 | ROOT = LMUDataRoot() 18 | assert isinstance(dataset, str) 19 | img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) 20 | os.makedirs(img_root, exist_ok=True) 21 | 22 | if 'image' in line: 23 | if isinstance(line['image'], list): 24 | tgt_path = [] 25 | assert 'image_path' in line 26 | for img, im_name in zip(line['image'], line['image_path']): 27 | path = osp.join(img_root, im_name) 28 | if not read_ok(path): 29 | decode_base64_to_image_file(img, path) 30 | tgt_path.append(path) 31 | else: 32 | tgt_path = osp.join(img_root, f"{line['index']}.jpg") 33 | if not read_ok(tgt_path): 34 | decode_base64_to_image_file(line['image'], tgt_path) 35 | tgt_path = [tgt_path] 36 | else: 37 | assert 'image_path' in line 38 | tgt_path = toliststr(line['image_path']) 39 | 40 | return tgt_path 41 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/utils/matching_util.py: -------------------------------------------------------------------------------- 1 | import string 2 | import copy as cp 3 | import os 4 | from ..smp import * 5 | 6 | 7 | def can_infer_option(answer, choices): 8 | verbose = os.environ.get('VERBOSE', 0) 9 | # Choices is a dictionary 10 | if 'Failed to obtain answer via API' in answer: 11 | return False 12 | 13 | reject_to_answer = [ 14 | "Sorry, I can't help with images of people yet.", 15 | "I can't process this file.", 16 | "I'm sorry, but without the image provided", 17 | 'Cannot determine the answer' 18 | ] 19 | for err in reject_to_answer: 20 | if err in answer: 21 | return 'Z' 22 | 23 | def count_choice(splits, choices, prefix='', suffix=''): 24 | cnt = 0 25 | for c in choices: 26 | if prefix + c + suffix in splits: 27 | cnt += 1 28 | return cnt 29 | 30 | answer_mod = cp.copy(answer) 31 | chars = '.()[],:;!*#{}' 32 | for c in chars: 33 | answer_mod = answer_mod.replace(c, ' ') 34 | 35 | splits = [x.strip() for x in answer_mod.split()] 36 | count = count_choice(splits, choices) 37 | 38 | if count == 1: 39 | for ch in choices: 40 | if 'A' in splits and len(splits) > 3 and verbose: 41 | logger = get_logger('Evaluation') 42 | logger.info(f'A might be a quantifier in the string: {answer}.') 43 | return False 44 | if ch in splits: 45 | return ch 46 | elif count == 0 and count_choice(splits, {'Z', ''}) == 1: 47 | return 'Z' 48 | return False 49 | 50 | 51 | def can_infer_text(answer, choices): 52 | answer = answer.lower() 53 | assert isinstance(choices, dict) 54 | for k in choices: 55 
| assert k in string.ascii_uppercase 56 | choices[k] = str(choices[k]).lower() 57 | cands = [] 58 | for k in choices: 59 | if choices[k] in answer: 60 | cands.append(k) 61 | if len(cands) == 1: 62 | return cands[0] 63 | return False 64 | 65 | 66 | def can_infer(answer, choices): 67 | answer = str(answer) 68 | copt = can_infer_option(answer, choices) 69 | return copt if copt else can_infer_text(answer, choices) 70 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/utils/result_transfer.py: -------------------------------------------------------------------------------- 1 | from ..evaluate.misc import build_judge 2 | from ..evaluate.multiple_choice import extract_answer_from_item 3 | 4 | from ..smp import * 5 | from .matching_util import can_infer 6 | from .mp_util import track_progress_rich 7 | 8 | 9 | def MMMU_result_transfer(result_path): 10 | res = {} 11 | result_data = load(result_path) 12 | mcq = result_data['A'].notna() 13 | lt = len(result_data) 14 | for i in range(lt): 15 | line = result_data.iloc[i] 16 | if mcq[i]: 17 | options = { 18 | cand: line[cand] 19 | for cand in string.ascii_uppercase 20 | if cand in line and not pd.isna(line[cand]) 21 | } 22 | prediction = line['prediction'] 23 | infer_prediction = can_infer(prediction, options) 24 | res[line['id']] = infer_prediction 25 | else: 26 | res[line['id']] = line['prediction'] 27 | result_json = result_path.replace('.xlsx', '.json') 28 | dump(res, result_json) 29 | return result_json 30 | 31 | 32 | def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): 33 | logger = get_logger('Evaluation') 34 | INTERNAL = os.environ.get('INTERNAL', 0) 35 | nproc = judge_kwargs.pop('nproc', 4) 36 | 37 | rd.seed(2680) 38 | suffix = eval_file.split('.')[-1] 39 | model = judge_kwargs['model'] 40 | assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] 41 | name_str_map = { 42 | 'chatgpt-0125': 'openai', 43 | 'gpt-4-0125': 'gpt4' 44 | } 45 | name_str = name_str_map[model] if model in name_str_map else model 46 | 47 | if model == 'exact_matching': 48 | model = None 49 | else: 50 | if INTERNAL or gpt_key_set(): 51 | model = build_judge(**judge_kwargs) 52 | else: 53 | logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') 54 | model = None 55 | 56 | logger.info(f'Evaluating {eval_file}') 57 | result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') 58 | result = {} 59 | if osp.exists(result_file): 60 | result = load(result_file) 61 | 62 | data = load(eval_file) 63 | assert 'index' in data, 'Essential columns missing in the eval_file.' 
64 | 65 | data = data.sort_values(by='index') 66 | data['prediction'] = [str(x) for x in data['prediction']] 67 | for k in data.keys(): 68 | data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) 69 | 70 | idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))} 71 | idx2lines = {k: v for k, v in idx2lines.items() if k not in result} 72 | 73 | indices = list(idx2lines.keys()) 74 | lines = [idx2lines[i] for i in indices] 75 | tups = [(model, line) for line in lines] 76 | res = track_progress_rich( 77 | extract_answer_from_item, 78 | tups, 79 | nproc=nproc, 80 | chunksize=nproc, 81 | save=result_file, 82 | keys=indices) 83 | 84 | for i, r in zip(indices, res): 85 | if i in result: 86 | assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log'] 87 | else: 88 | result[i] = r 89 | 90 | indices = list(data['index']) 91 | data['opt'] = [result[i]['opt'] for i in data['index']] 92 | data['log'] = [result[i]['log'] for i in data['index']] 93 | 94 | # load split 95 | output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') 96 | dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) 97 | return output_path 98 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.set_grad_enabled(False) 4 | torch.manual_seed(1234) 5 | from .base import BaseModel 6 | from .cogvlm import CogVlm, GLM4v 7 | from .emu import Emu 8 | from .idefics import IDEFICS, IDEFICS2 9 | from .instructblip import InstructBLIP 10 | from .llava import LLaVA, LLaVA_Next, LLaVA_XTuner 11 | from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V 12 | from .minigpt4 import MiniGPT4 13 | from .mmalaya import MMAlaya 14 | from .monkey import Monkey, MonkeyChat 15 | from .mplug_owl2 import mPLUG_Owl2 16 | from .omnilmm import OmniLMM12B 17 | from .open_flamingo import OpenFlamingo 18 | from .pandagpt import PandaGPT 19 | from .qwen_vl import QwenVL, QwenVLChat 20 | from .transcore_m import TransCoreM 21 | from .visualglm import VisualGLM 22 | from .xcomposer import ShareCaptioner, XComposer, XComposer2, XComposer2_4KHD 23 | from .yi_vl import Yi_VL 24 | from .internvl_chat import InternVLChat 25 | from .deepseek_vl import DeepSeekVL 26 | from .mgm import Mini_Gemini 27 | from .bunnyllama3 import BunnyLLama3 28 | from .vxverse import VXVERSE 29 | from .paligemma import PaliGemma 30 | from .qh_360vl import QH_360VL 31 | from .phi3_vision import Phi3Vision 32 | from .wemm import WeMM 33 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/bunnyllama3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from PIL import Image 5 | import warnings 6 | 7 | from .base import BaseModel 8 | from ..smp import * 9 | from ..utils import DATASET_TYPE 10 | 11 | 12 | class BunnyLLama3(BaseModel): 13 | 14 | INSTALL_REQ = False 15 | INTERLEAVE = False 16 | 17 | def __init__(self, model_path='BAAI/Bunny-Llama-3-8B-V', **kwargs): 18 | assert model_path is not None 19 | transformers.logging.set_verbosity_error() 20 | transformers.logging.disable_progress_bar() 21 | warnings.filterwarnings('ignore') 22 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 23 | self.model = 
AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True) 24 | self.kwargs = kwargs 25 | 26 | def generate_inner(self, message, dataset=None): 27 | prompt, image_path = self.message_to_promptimg(message) 28 | text = f"A chat between a curious user and an artificial intelligence assistant. \ 29 | The assistant gives helpful, detailed, and polite answers to the user's questions. \ 30 | USER: <image>\n{prompt} ASSISTANT:" 31 | text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')] 32 | input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0) 33 | image = Image.open(image_path).convert('RGB') 34 | image_tensor = self.model.process_images([image], self.model.config).to(dtype=self.model.dtype) 35 | 36 | output_ids = self.model.generate(input_ids, images=image_tensor, max_new_tokens=100, use_cache=True)[0] 37 | response = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True) 38 | return response 39 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/deepseek_vl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from transformers import AutoModelForCausalLM 4 | import warnings 5 | from .base import BaseModel 6 | 7 | 8 | class DeepSeekVL(BaseModel): 9 | 10 | INSTALL_REQ = True 11 | INTERLEAVE = True 12 | 13 | def check_install(self): 14 | try: 15 | import deepseek_vl 16 | except ImportError: 17 | warnings.warn( 18 | 'Please first install deepseek_vl from source codes in: https://github.com/deepseek-ai/DeepSeek-VL') 19 | sys.exit(-1) 20 | 21 | def __init__(self, model_path='deepseek-ai/deepseek-vl-1.3b-chat', **kwargs): 22 | self.check_install() 23 | assert model_path is not None 24 | self.model_path = model_path 25 | from deepseek_vl.models import VLChatProcessor 26 | 27 | self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path) 28 | self.tokenizer = self.vl_chat_processor.tokenizer 29 | 30 | model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) 31 | self.model = model.to(torch.bfloat16).cuda().eval() 32 | 33 | torch.cuda.empty_cache() 34 | default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True) 35 | default_kwargs.update(kwargs) 36 | self.kwargs = default_kwargs 37 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 38 | 39 | def prepare_inputs(self, message): 40 | content, images = '', [] 41 | for s in message: 42 | if s['type'] == 'image': 43 | images.append(s['value']) 44 | content += '<image_placeholder>' 45 | elif s['type'] == 'text': 46 | content += s['value'] 47 | conversation = [ 48 | dict(role='User', content=content, images=images), 49 | dict(role='Assistant', content='') 50 | ] 51 | return conversation 52 | 53 | def generate_inner(self, message, dataset=None): 54 | conversation = self.prepare_inputs(message) 55 | from deepseek_vl.utils.io import load_pil_images 56 | pil_images = load_pil_images(conversation) 57 | prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True) 58 | prepare_inputs = prepare_inputs.to(self.model.device) 59 | inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs) 60 | 61 | outputs = self.model.language_model.generate( 62 | inputs_embeds=inputs_embeds, 63 | attention_mask=prepare_inputs.attention_mask, 64 | pad_token_id=self.tokenizer.eos_token_id, 65 | bos_token_id=self.tokenizer.bos_token_id, 66 | eos_token_id=self.tokenizer.eos_token_id, 67 | **self.kwargs) 68 | answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True) 69 | return answer 70 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/emu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from PIL import Image 4 | import os.path as osp 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class Emu(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = True 13 | 14 | def __init__(self, 15 | model_path='BAAI/Emu2-Chat', 16 | **kwargs): 17 | 18 | self.model_path = model_path 19 | assert osp.exists(model_path) or splitlen(model_path) == 2 20 | 21 | from transformers import AutoModelForCausalLM, AutoTokenizer 22 | from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model 23 | 24 | local_rank = os.environ.get('LOCAL_RANK', 0) 25 | 26 | device_num = torch.cuda.device_count() 27 | assert local_rank * 2 <= device_num, 'The number of devices does not match the world size' 28 | assert device_num >= 2, 'You need at least 2 GPUs to use EMU' 29 | 30 | device_1 = local_rank 31 | device_2 = local_rank + device_num // 2 32 | 33 | torch.cuda.set_device(device_1) 34 | torch.cuda.set_device(device_2) 35 | 36 | tokenizer = AutoTokenizer.from_pretrained(model_path) # "BAAI/Emu2-Chat" 37 | self.tokenizer = tokenizer 38 | with init_empty_weights(): 39 | model = AutoModelForCausalLM.from_pretrained( 40 | model_path, # "BAAI/Emu2-Chat" 41 | torch_dtype=torch.bfloat16, 42 | low_cpu_mem_usage=True, 43 | trust_remote_code=True) 44 | 45 | device_map = infer_auto_device_map( 46 | model, 47 | max_memory={ 48 | device_1: '38GiB', 49 | device_2: '38GiB' 50 | }, 51 | no_split_module_classes=['Block', 'LlamaDecoderLayer']) 52 | 53 | # input and output logits should be on same device 54 | device_map['model.decoder.lm.lm_head'] = device_1 55 | 56 | model = dispatch_model( 57 | model, 58 | device_map=device_map).eval() 59 | 60 | self.model = model 61 | kwargs_default = dict(max_new_tokens=512, length_penalty=-1) 62 | kwargs_default.update(kwargs) 63 | self.kwargs = kwargs_default 64 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 65 | 66 | def generate_inner(self, message, dataset=None): 67 | query, images = '', [] 68 | for item in message: 69 | if item['type'] == 'image': 70 | images.append(Image.open(item['value']).convert('RGB')) 71 | query += '[<IMG_PLH>]' 72 | elif item['type'] == 'text': 73 | query += item['value'] 74 | 75 | inputs = self.model.build_input_ids( 76 | text=[query], 77 | tokenizer=self.tokenizer, 78 | image=images 79 | ) 80 | 81 | with torch.no_grad(): 82 | outputs = self.model.generate( 83 | input_ids=inputs['input_ids'], 84 | attention_mask=inputs['attention_mask'], 85 | image=inputs['image'].to(torch.bfloat16), 86 | **self.kwargs) 87 | 88 | output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) 89 | return output_text[0] 90 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/instructblip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import os.path as osp 4 | import sys 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class InstructBLIP(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, name): 15 | self.config_map = { 16 | 'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml', 17 | 'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml', 18 | } 19 | 20 | self.file_path = __file__ 21 | config_root = osp.dirname(self.file_path) 22 | 23 | try: 24 | from lavis.models import load_preprocess 25 | from omegaconf import OmegaConf 26 | from lavis.common.registry import registry 27 | except: 28 | warnings.warn('Please install lavis before using InstructBLIP. ') 29 | sys.exit(-1) 30 | 31 | assert name in self.config_map 32 | cfg_path = osp.join(config_root, self.config_map[name]) 33 | cfg = OmegaConf.load(cfg_path) 34 | 35 | model_cfg = cfg.model 36 | assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2 37 | model_cls = registry.get_model_class(name='blip2_vicuna_instruct') 38 | model = model_cls.from_config(model_cfg) 39 | model.eval() 40 | 41 | self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu' 42 | device = self.device 43 | model.to(device) 44 | self.model = model 45 | self.kwargs = {'max_length': 512} 46 | 47 | preprocess_cfg = cfg.preprocess 48 | vis_processors, _ = load_preprocess(preprocess_cfg) 49 | self.vis_processors = vis_processors 50 | 51 | def generate_inner(self, message, dataset=None): 52 | prompt, image_path = self.message_to_promptimg(message) 53 | vis_processors = self.vis_processors 54 | raw_image = Image.open(image_path).convert('RGB') 55 | image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device) 56 | outputs = self.model.generate(dict(image=image_tensor, prompt=prompt)) 57 | return outputs[0] 58 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava import LLaVA, LLaVA_Next 2 | from .llava_xtuner import LLaVA_XTuner 3 | 4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner'] 5 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/minigpt4.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os.path as osp 4 | import warnings 5 | from transformers import StoppingCriteriaList 6 | from .base import 
BaseModel 7 | 8 | 9 | class MiniGPT4(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, 15 | mode='v2', 16 | root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/', 17 | temperature=1, 18 | max_out_len=512): 19 | 20 | if root is None: 21 | warnings.warn( 22 | 'Please set root to the directory of MiniGPT-4, which is cloned from here: ' 23 | 'https://github.com/Vision-CAIR/MiniGPT-4. ' 24 | ) 25 | 26 | if mode == 'v2': 27 | cfg = 'minigptv2_eval.yaml' 28 | elif mode == 'v1_7b': 29 | cfg = 'minigpt4_7b_eval.yaml' 30 | elif mode == 'v1_13b': 31 | cfg = 'minigpt4_13b_eval.yaml' 32 | else: 33 | raise NotImplementedError 34 | 35 | self.mode = mode 36 | self.temperature = temperature 37 | self.max_out_len = max_out_len 38 | self.root = root 39 | this_dir = osp.dirname(__file__) 40 | 41 | self.cfg = osp.join(this_dir, 'misc', cfg) 42 | sys.path.append(self.root) 43 | 44 | from omegaconf import OmegaConf 45 | from minigpt4.common.registry import registry 46 | from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2 47 | 48 | device = torch.cuda.current_device() 49 | self.device = device 50 | 51 | cfg_path = self.cfg 52 | cfg = OmegaConf.load(cfg_path) 53 | 54 | model_cfg = cfg.model 55 | model_cfg.device_8bit = device 56 | model_cls = registry.get_model_class(model_cfg.arch) 57 | model = model_cls.from_config(model_cfg) 58 | model = model.to(device) 59 | model.eval() 60 | vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train 61 | vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) 62 | self.model = model 63 | self.vis_processor = vis_processor 64 | 65 | self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0 66 | stop_words_ids = [[835], [2277, 29937]] 67 | stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids] 68 | self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) 69 | 70 | def generate_inner(self, message, dataset=None): 71 | from minigpt4.conversation.conversation import Chat 72 | prompt, image_path = self.message_to_promptimg(message) 73 | if self.mode == 'v2': 74 | chat = Chat(self.model, self.vis_processor, device=self.device) 75 | else: 76 | chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria) 77 | 78 | chat_state = self.CONV_VISION.copy() 79 | img_list = [] 80 | _ = chat.upload_img(image_path, chat_state, img_list) 81 | chat.encode_img(img_list) 82 | chat.ask(prompt, chat_state) 83 | with torch.inference_mode(): 84 | msg = chat.answer(conv=chat_state, img_list=img_list)[0] 85 | return msg 86 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/misc/minigpt4_13b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-13b-v0" 25 | 26 | datasets: 27 | cc_sbu_align: 28 | vis_processor: 29 | train: 30 | name: "blip2_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | 36 | run: 37 | task: image_text_pretrain 38 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/misc/minigpt4_7b_eval.yaml: 
-------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-7b-v0" 25 | 26 | 27 | datasets: 28 | cc_sbu_align: 29 | vis_processor: 30 | train: 31 | name: "blip2_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | 37 | run: 38 | task: image_text_pretrain 39 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/misc/minigptv2_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt_v2 3 | model_type: pretrain 4 | max_txt_len: 160 5 | end_sym: "</s>" 6 | low_resource: True 7 | prompt_template: '[INST] {} [/INST]' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | lora_r: 64 10 | lora_alpha: 16 11 | 12 | # vit encoder 13 | image_size: 448 14 | drop_path_rate: 0 15 | use_grad_checkpoint: False 16 | vit_precision: "fp16" 17 | freeze_vit: True 18 | 19 | # generation configs 20 | prompt: "" 21 | 22 | # LLM 23 | llama_model: "please set this value to the path of llama2-chat-7b" 24 | 25 | datasets: 26 | cc_sbu_align: 27 | vis_processor: 28 | train: 29 | name: "blip2_image_eval" 30 | image_size: 448 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | 35 | run: 36 | task: image_text_pretrain 37 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/mmalaya.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | from PIL import Image 5 | from .base import BaseModel 6 | 7 | 8 | class MMAlaya(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='DataCanvas/MMAlaya', **kwargs): 14 | assert model_path is not None 15 | self.model_path = model_path 16 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 17 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True).eval() 18 | # need initialize tokenizer 19 | model.initialize_tokenizer(self.tokenizer) 20 | self.model = model.cuda() 21 | 22 | self.kwargs = kwargs 23 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 24 | torch.cuda.empty_cache() 25 | 26 | def generate_inner(self, message, dataset=None): 27 | # read image 28 | prompt, image_path = self.message_to_promptimg(message) 29 | image = Image.open(image_path).convert('RGB') 30 | # tokenize prompt, and proprecess image 31 | input_ids, image_tensor, stopping_criteria = self.model.prepare_for_inference( 32 | prompt, 33 | self.tokenizer, 34 | image, 35 | return_tensors='pt') 36 | with torch.inference_mode(): 37 | output_ids = self.model.generate( 38 | inputs=input_ids.cuda(), 39 | images=image_tensor.cuda(), 40 | do_sample=False, 41 | max_new_tokens=512, 42 | num_beams=1, 43 | use_cache=True, 44 | stopping_criteria=[stopping_criteria], 45 | ) 46 | # truncate input_ids in generate_ids and then decode to text 47 | input_token_len = input_ids.shape[1] 48 | response = self.tokenizer.batch_decode( 49 | output_ids[:, input_token_len:].cpu(), 50 | skip_special_tokens=True, 51 | clean_up_tokenization_spaces=False 52 | )[0].strip() 53 | return response 54 | 55 | 56 | if __name__ == '__main__': 57 | model = MMAlaya() 58 | response = model.generate(['./assets/apple.jpg', '请详细描述一下这张图片。']) 59 | print(response) 60 | 61 | """ 62 | export PYTHONPATH=$PYTHONPATH:/tmp/VLMEvalKit 63 | CUDA_VISIBLE_DEVICES=0 python vlmeval/vlm/mmalaya.py 64 | """ 65 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/open_flamingo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from PIL import Image 4 | import os.path as osp 5 | import warnings 6 | from .base import BaseModel 7 | from ..smp import splitlen, get_cache_path 8 | from huggingface_hub import snapshot_download 9 | 10 | 11 | class OpenFlamingo(BaseModel): 12 | 13 | INSTALL_REQ = True 14 | INTERLEAVE = True 15 | 16 | def __init__(self, 17 | name, 18 | mpt_pth=None, 19 | ckpt_pth=None, 20 | **kwargs): 21 | 22 | if mpt_pth is None: 23 | warnings.warn( 24 | 'Please set `mpt_pth` to the directory of MPT-7B, which is cloned from here: ' 25 | 'https://huggingface.co/mosaicml/mpt-7b. ' 26 | ) 27 | sys.exit(-1) 28 | if ckpt_pth is None: 29 | warnings.warn( 30 | 'Please set `ckpt_pth` to the openflamingo ckpt, which is the `checkpoint.pt` file downloaded ' 31 | 'from: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b/tree/main. 
' 32 | ) 33 | sys.exit(-1) 34 | else: 35 | if osp.exists(ckpt_pth): 36 | if ckpt_pth.endswith('checkpoint.pt'): 37 | pass 38 | elif osp.isdir(ckpt_pth): 39 | ckpt_pth = osp.join(ckpt_pth, 'checkpoint.pt') 40 | if not osp.exists(ckpt_pth): 41 | sys.exit(-1) 42 | elif splitlen(ckpt_pth, '/') == 2: 43 | cache_path = get_cache_path(ckpt_pth) 44 | if cache_path is None: 45 | snapshot_download(ckpt_pth) 46 | cache_path = get_cache_path(ckpt_pth) 47 | if cache_path is None: 48 | sys.exit(-1) 49 | else: 50 | ckpt_pth = osp.join(cache_path, 'checkpoint.pt') 51 | 52 | self.name = name 53 | assert name in ['v2'] 54 | self.mpt_pth = mpt_pth 55 | try: 56 | from open_flamingo import create_model_and_transforms 57 | except: 58 | raise ImportError('Please first install open_flamingo to use OpenFlamingo') 59 | model, image_processor, tokenizer = create_model_and_transforms( 60 | clip_vision_encoder_path='ViT-L-14', 61 | clip_vision_encoder_pretrained='openai', 62 | lang_encoder_path=mpt_pth, 63 | tokenizer_path=mpt_pth, 64 | cross_attn_every_n_layers=4) 65 | ckpt = torch.load(ckpt_pth) 66 | model.load_state_dict(ckpt, strict=False) 67 | torch.cuda.empty_cache() 68 | self.model = model.eval().cuda() 69 | self.tokenizer = tokenizer 70 | self.tokenizer.padding_side = 'left' 71 | self.image_proc = image_processor 72 | 73 | kwargs_default = dict(max_new_tokens=512, num_beams=3) 74 | kwargs_default.update(kwargs) 75 | self.kwargs = kwargs_default 76 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 77 | 78 | def generate_inner(self, message, dataset=None): 79 | vision_x = [] 80 | prompt = '' 81 | for msg in message: 82 | if msg['type'] == 'image': 83 | img = Image.open(msg['value']) 84 | vision_x.append(self.image_proc(img).unsqueeze(0)) 85 | prompt += '<image>' 86 | elif msg['type'] == 'text': 87 | prompt += msg['value'] 88 | prompt += 'Answer: ' 89 | vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0] 90 | vision_x = vision_x.unsqueeze(1).unsqueeze(0) 91 | lang_x = self.tokenizer([prompt], return_tensors='pt') 92 | generated_text = self.model.generate( 93 | vision_x=vision_x.cuda(), 94 | lang_x=lang_x['input_ids'].cuda(), 95 | attention_mask=lang_x['attention_mask'].cuda(), 96 | **self.kwargs) 97 | generated_text = self.tokenizer.decode(generated_text[0]) 98 | text = generated_text[len(prompt):].split('<|endofchunk|>')[0] 99 | return text 100 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/paligemma.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class PaliGemma(BaseModel): 9 | INSTALL_REQ = False 10 | INTERLEAVE = False 11 | 12 | def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs): 13 | try: 14 | from transformers import AutoProcessor, PaliGemmaForConditionalGeneration 15 | except: 16 | warnings.warn('Please install the latest version transformers.') 17 | sys.exit(-1) 18 | model = PaliGemmaForConditionalGeneration.from_pretrained( 19 | model_path, 20 | torch_dtype=torch.bfloat16, 21 | device_map='cpu', 22 | revision='bfloat16', 23 | ).eval() 24 | self.model = model.cuda() 25 | self.processor = AutoProcessor.from_pretrained(model_path) 26 | self.kwargs = kwargs 27 | 28 | def generate_inner(self, message, dataset=None): 29 | prompt, image_path = self.message_to_promptimg(message) 30 | image = 
Image.open(image_path).convert('RGB') 31 | 32 | model_inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda') 33 | input_len = model_inputs['input_ids'].shape[-1] 34 | 35 | with torch.inference_mode(): 36 | generation = self.model.generate(**model_inputs, max_new_tokens=512, do_sample=False) 37 | generation = generation[0][input_len:] 38 | res = self.processor.decode(generation, skip_special_tokens=True) 39 | return res 40 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/pandagpt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import os.path as osp 4 | import warnings 5 | from .base import BaseModel 6 | 7 | 8 | class PandaGPT(BaseModel): 9 | 10 | INSTALL_REQ = True 11 | INTERLEAVE = False 12 | 13 | def __init__(self, name, root=None, **kwargs): 14 | if root is None: 15 | warnings.warn('Please set `root` to PandaGPT code directory, which is cloned from here: ') 16 | sys.exit(-1) 17 | 18 | assert name == 'PandaGPT_13B' 19 | self.name = name 20 | sys.path.append(osp.join(root, 'code')) 21 | try: 22 | from model.openllama import OpenLLAMAPEFTModel 23 | except: 24 | raise ImportError( 25 | 'Please first install PandaGPT and set the root path to use PandaGPT, ' 26 | 'which is cloned from here: https://github.com/yxuansu/PandaGPT. ' 27 | ) 28 | self.args = { 29 | 'model': 'openllama_peft', 30 | 'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'), 31 | 'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'), 32 | 'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'), 33 | 'stage': 2, 34 | 'max_tgt_len': 512, 35 | 'lora_r': 32, 36 | 'lora_alpha': 32, 37 | 'lora_dropout': 0.1, 38 | } 39 | model = OpenLLAMAPEFTModel(**self.args) 40 | delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) 41 | model.load_state_dict(delta_ckpt, strict=False) 42 | torch.cuda.empty_cache() 43 | self.model = model.eval().half().cuda() 44 | kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} 45 | kwargs_default.update(kwargs) 46 | self.kwargs = kwargs_default 47 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 48 | 49 | def generate_inner(self, message, dataset=None): 50 | prompt, image_path = self.message_to_promptimg(message) 51 | struct = { 52 | 'prompt': prompt, 53 | 'image_paths': [image_path], 54 | 'audio_paths': [], 55 | 'video_paths': [], 56 | 'thermal_paths': [], 57 | 'modality_embeds': [] 58 | } 59 | struct.update(self.kwargs) 60 | resp = self.model.generate(struct) 61 | return resp 62 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/phi3_vision.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class Phi3Vision(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs): 14 | try: 15 | from transformers import AutoProcessor, AutoModelForCausalLM 16 | except: 17 | warnings.warn('Please install the latest version transformers.') 18 | sys.exit(-1) 19 | model = AutoModelForCausalLM.from_pretrained( 20 | model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto').eval() 21 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 22 | self.model = model 23 | self.processor = processor 24 | self.kwargs = kwargs 25 | 26 | def generate_inner(self, message, dataset=None): 27 | prompt, image_path = self.message_to_promptimg(message) 28 | image = Image.open(image_path).convert('RGB') 29 | messages = [ 30 | {'role': 'user', 'content': f'<|image_1|>\n{prompt}'} 31 | ] 32 | prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 33 | inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda') 34 | 35 | generation_args = { 36 | 'max_new_tokens': 500, 37 | 'temperature': 0.0, 38 | 'do_sample': False, 39 | } 40 | generation_args.update(self.kwargs) 41 | 42 | generate_ids = self.model.generate( 43 | **inputs, 44 | eos_token_id=self.processor.tokenizer.eos_token_id, 45 | **generation_args 46 | ) 47 | generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] 48 | response = self.processor.batch_decode( 49 | generate_ids, 50 | skip_special_tokens=True, 51 | clean_up_tokenization_spaces=False 52 | )[0] 53 | return response 54 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/qh_360vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import os.path as osp 5 | from PIL import Image 6 | from .base import BaseModel 7 | from ..smp import * 8 | from ..utils import DATASET_TYPE 9 | 10 | 11 | class QH_360VL(BaseModel): 12 | 13 | INSTALL_REQ = False 14 | INTERLEAVE = False 15 | 16 | def __init__(self, model_path='qihoo360/360VL-70B', **kwargs): 17 | assert model_path is not None 18 | self.model_path = model_path 19 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 20 | self.model = AutoModelForCausalLM.from_pretrained(model_path, 21 | torch_dtype=torch.float16, 22 | low_cpu_mem_usage=True, 23 | device_map='auto', 24 | trust_remote_code=True).eval() 25 | vision_tower = self.model.get_vision_tower() 26 | vision_tower.load_model() 27 | vision_tower.to(device='cuda', dtype=torch.float16) 28 | self.image_processor = vision_tower.image_processor 29 | self.tokenizer.pad_token = 
self.tokenizer.eos_token 30 | self.kwargs = kwargs 31 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 32 | torch.cuda.empty_cache() 33 | 34 | def generate(self, message, dataset=None): 35 | 36 | prompt, image_path = self.message_to_promptimg(message) 37 | print(prompt) 38 | image = Image.open(image_path).convert('RGB') 39 | terminators = [ 40 | self.tokenizer.convert_tokens_to_ids('<|eot_id|>',) 41 | ] 42 | inputs = self.model.build_conversation_input_ids(self.tokenizer, 43 | query=prompt, 44 | image=image, 45 | image_processor=self.image_processor) 46 | input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True) 47 | images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True) 48 | 49 | output_ids = self.model.generate(input_ids=input_ids, 50 | images=images, 51 | do_sample=False, 52 | num_beams=1, 53 | max_new_tokens=512, 54 | eos_token_id=terminators, 55 | use_cache=True) 56 | 57 | input_token_len = input_ids.shape[1] 58 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 59 | response = outputs.strip() 60 | 61 | return response 62 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/qwen_vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import copy as cp 5 | from .base import BaseModel 6 | from ..smp import isimg, listinstr 7 | from ..utils import DATASET_TYPE 8 | 9 | 10 | class QwenVL(BaseModel): 11 | 12 | INSTALL_REQ = False 13 | INTERLEAVE = True 14 | 15 | def __init__(self, model_path='Qwen/Qwen-VL', **kwargs): 16 | assert model_path is not None 17 | self.model_path = model_path 18 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 19 | tokenizer.padding_side = 'left' 20 | tokenizer.pad_token_id = tokenizer.eod_id 21 | self.tokenizer = tokenizer 22 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval() 23 | default_kwargs = dict( 24 | do_sample=False, 25 | num_beams=1, 26 | max_new_tokens=512, 27 | min_new_tokens=1, 28 | num_return_sequences=1, 29 | use_cache=True, 30 | output_hidden_states=True, 31 | pad_token_id=tokenizer.eod_id, 32 | eos_token_id=tokenizer.eod_id) 33 | default_kwargs.update(kwargs) 34 | self.kwargs = default_kwargs 35 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 36 | torch.cuda.empty_cache() 37 | 38 | def adjust_kwargs(self, dataset): 39 | kwargs = cp.deepcopy(self.kwargs) 40 | if DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: 41 | kwargs['max_new_tokens'] = 32 42 | elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset: 43 | kwargs['max_new_tokens'] = 32 44 | elif DATASET_TYPE(dataset) == 'VQA': 45 | if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset): 46 | kwargs['max_new_tokens'] = 100 47 | elif listinstr(['TextVQA'], dataset): 48 | kwargs['max_new_tokens'] = 10 49 | return kwargs 50 | 51 | def generate_inner(self, message, dataset=None): 52 | if dataset is not None: 53 | kwargs = self.adjust_kwargs(dataset) 54 | else: 55 | kwargs = self.kwargs 56 | prompt = '' 57 | for s in message: 58 | if s['type'] == 'image': 59 | prompt += f'<img>{s["value"]}</img>' 60 | elif s['type'] == 'text': 61 | prompt += s['value'] 62 | if dataset is not None and DATASET_TYPE(dataset) == 'VQA': 63 | prompt += ' Answer:' 64 | encoded = self.tokenizer([prompt], return_tensors='pt', padding='longest') 65 | input_ids = encoded.input_ids.to('cuda') 66 | attention_mask = encoded.attention_mask.to('cuda') 67 | 68 | pred = self.model.generate( 69 | input_ids=input_ids, 70 | attention_mask=attention_mask, 71 | **kwargs) 72 | answer = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip() 73 | return answer 74 | 75 | 76 | class QwenVLChat(BaseModel): 77 | 78 | INSTALL_REQ = False 79 | INTERLEAVE = True 80 | 81 | def __init__(self, model_path='Qwen/Qwen-VL-Chat', **kwargs): 82 | assert model_path is not None 83 | self.model_path = model_path 84 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 85 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval() 86 | torch.cuda.empty_cache() 87 | self.kwargs = kwargs 88 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 89 | 90 | def generate_inner(self, message, dataset=None): 91 | vl_list = [{'image': s['value']} if s['type'] == 'image' else {'text': s['value']} for s in message] 92 | query = self.tokenizer.from_list_format(vl_list) 93 | response, _ = self.model.chat(self.tokenizer, query=query, history=None, **self.kwargs) 94 | return response 95 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/visualglm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .base import BaseModel 3 | from ..smp import * 4 | 5 | 6 | class VisualGLM(BaseModel): 7 | 8 | INSTALL_REQ = False 9 | INTERLEAVE = False 10 | 11 | def __init__(self, model_path='THUDM/visualglm-6b', **kwargs): 12 | try: 13 | import sat 14 | except: 15 | warnings.warn('Please install SwissArmyTransformer to use VisualGLM') 16 | assert model_path is not None 17 | self.model_path = model_path 18 | 19 | from transformers import AutoModel 20 | from transformers import AutoTokenizer 21 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 22 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda() 23 | self.model = model 24 | self.kwargs = kwargs 25 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 26 | 27 | def generate_inner(self, message, dataset=None): 28 | prompt, image_path = self.message_to_promptimg(message) 29 | output, _ = self.model.chat( 30 | image_path=image_path, 31 | tokenizer=self.tokenizer, 32 | query=prompt, 33 | history=[], 34 | **self.kwargs 35 | ) 36 | return output 37 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/wemm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import sys 4 | from ..smp import * 5 | from .base import BaseModel 6 | from ..utils import DATASET_TYPE 7 | from transformers import AutoModel, GenerationConfig 8 | 9 | 10 | class WeMM(BaseModel): 11 | def __init__(self, model_path='feipengma/WeMM', **kwargs): 12 | self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True) 13 | self.wemm.cuda() 14 | self.wemm.eval() 15 | torch.cuda.empty_cache() 16 | 17 | def use_custom_prompt(self, dataset): 18 | assert dataset is not None 19 | if DATASET_TYPE(dataset) == 'multi-choice': 20 | return True 21 | return False 22 | 23 | def build_prompt(self, line, dataset=None): 24 | assert self.use_custom_prompt(dataset) 25 | assert dataset is None or isinstance(dataset, str) 26 | tgt_path = self.dump_image(line, dataset) 27 | question = line['question'] 28 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 29 | if hint is not None: 30 | question = hint + '\n' + question 31 | options = { 32 | cand: line[cand] 33 | for cand in string.ascii_uppercase 34 | if cand in line and not pd.isna(line[cand]) 35 | } 36 | for key, item in options.items(): 37 | question += f'\n{key}. {item}' 38 | prompt = question 39 | 40 | if len(options): 41 | prompt += ( 42 | '\n请直接回答选项字母。' if cn_string(prompt) else 43 | "\nAnswer with the option's letter from the given choices directly." 44 | ) 45 | else: 46 | prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' 47 | 48 | message = [dict(type='text', value=prompt)] 49 | message.extend([dict(type='image', value=p) for p in tgt_path]) 50 | return message 51 | 52 | def generate_inner(self, message, dataset=None): 53 | prompt, image_path = self.message_to_promptimg(message) 54 | 55 | if dataset == 'HallusionBench': 56 | prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.' 
57 | 58 | gen_config = None 59 | if dataset == 'MMVet': 60 | gen_config = GenerationConfig( 61 | max_new_tokens=512, 62 | do_sample=True, 63 | temperature=0.7, 64 | num_beams=3, 65 | eos_token_id=self.wemm.tokenizer.eos_token_id, 66 | pad_token_id=self.wemm.tokenizer.pad_token_id 67 | if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id, 68 | ) 69 | pred = self.wemm.mm_generate(image_path, prompt, gen_config) 70 | 71 | return pred 72 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/xcomposer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sharecaptioner import ShareCaptioner 2 | from .xcomposer import XComposer 3 | from .xcomposer2 import XComposer2 4 | from .xcomposer2_4KHD import XComposer2_4KHD 5 | 6 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD'] 7 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/xcomposer/sharecaptioner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from ..base import BaseModel 4 | from ...smp import * 5 | from ...utils import DATASET_TYPE 6 | 7 | 8 | class ShareCaptioner(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='Lin-Chen/ShareCaptioner', **kwargs): 14 | assert model_path is not None 15 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 16 | self.model = AutoModelForCausalLM.from_pretrained( 17 | model_path, device_map='cuda', trust_remote_code=True).eval() 18 | self.model.tokenizer = tokenizer 19 | self.model.cuda() 20 | self.model.half() 21 | 22 | def use_custom_prompt(self, dataset): 23 | assert dataset is not None 24 | if DATASET_TYPE(dataset) == 'multi-choice': 25 | return True 26 | return False 27 | 28 | def build_prompt(self, line, dataset=None): 29 | assert dataset is None or isinstance(dataset, str) 30 | assert self.use_custom_prompt(dataset) 31 | tgt_path = self.dump_image(line, dataset) 32 | 33 | if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': 34 | question = line['question'] 35 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 36 | if hint is not None: 37 | question = hint + '\n' + question 38 | 39 | option_candidate = string.ascii_uppercase 40 | options = { 41 | cand: line[cand] 42 | for cand in option_candidate 43 | if cand in line and not pd.isna(line[cand]) 44 | } 45 | for key, item in options.items(): 46 | question += f'\n{key}. {item}' 47 | prompt = question 48 | 49 | if not cn_string(prompt): 50 | prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly." 
51 | else: 52 | prompt = prompt + '\n' + '请直接回答选项字母。' 53 | else: 54 | prompt = line['question'] 55 | message = [dict(type='text', value=prompt)] 56 | message.extend([dict(type='image', value=s) for s in tgt_path]) 57 | return message 58 | 59 | def generate_inner(self, message, dataset=None): 60 | prompt, image_path = self.message_to_promptimg(message) 61 | seg1 = '<|User|>:' 62 | seg2 = f'{prompt}{self.model.eoh}\n<|Bot|>:' 63 | self.seg_emb1 = self.model.encode_text(seg1, add_special_tokens=True) 64 | self.seg_emb2 = self.model.encode_text(seg2, add_special_tokens=False) 65 | 66 | image = Image.open(image_path).convert('RGB') 67 | image = self.model.vis_processor(image).unsqueeze(0) 68 | image = image.to(self.model.device) 69 | tmp_bs = image.shape[0] 70 | tmp_seg_emb1 = self.seg_emb1.repeat(tmp_bs, 1, 1) 71 | tmp_seg_emb2 = self.seg_emb2.repeat(tmp_bs, 1, 1) 72 | with torch.cuda.amp.autocast(): 73 | with torch.no_grad(): 74 | image = self.model.encode_img(image) 75 | input_emb = torch.cat( 76 | [tmp_seg_emb1, image, tmp_seg_emb2], dim=1) 77 | out_embeds = self.model.internlm_model.generate( 78 | inputs_embeds=input_emb, 79 | max_length=500, 80 | num_beams=3, 81 | min_length=1, 82 | do_sample=True, 83 | repetition_penalty=1.5, 84 | length_penalty=1.0, 85 | temperature=1., 86 | eos_token_id=self.model.tokenizer.eos_token_id, 87 | num_return_sequences=1) 88 | 89 | for j, out in enumerate(out_embeds): 90 | out[out == -1] = 2 91 | response = self.model.decode_text([out]) 92 | return response 93 | -------------------------------------------------------------------------------- /assets/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMT-Bench/84012c95e31c2986521ea5b7c16a88e36e9958c2/assets/apple.jpg -------------------------------------------------------------------------------- /assets/metatask_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMT-Bench/84012c95e31c2986521ea5b7c16a88e36e9958c2/assets/metatask_eval.png -------------------------------------------------------------------------------- /assets/overall_progress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMT-Bench/84012c95e31c2986521ea5b7c16a88e36e9958c2/assets/overall_progress.png -------------------------------------------------------------------------------- /assets/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMT-Bench/84012c95e31c2986521ea5b7c16a88e36e9958c2/assets/overview.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | gradio==4.15.0 3 | huggingface_hub 4 | matplotlib 5 | numpy>=1.23.4 6 | omegaconf 7 | openai==1.3.5 8 | opencv-python>=4.4.0.46 9 | openpyxl 10 | pandas>=1.5.3 11 | pillow 12 | portalocker 13 | protobuf 14 | pycocoevalcap 15 | python-dotenv 16 | requests 17 | rich 18 | seaborn 19 | sentencepiece 20 | sty 21 | tabulate 22 | tiktoken 23 | timeout-decorator 24 | torch>=2.0.1 25 | tqdm 26 | transformers 27 | typing_extensions==4.7.1 28 | validators 29 | visual_genome 30 | xlsxwriter 31 | xtuner 32 | -------------------------------------------------------------------------------- /vlmeval/__init__.py: 
-------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | except ImportError: 4 | pass 5 | 6 | from .smp import * 7 | from .api import * 8 | from .evaluate import * 9 | from .utils import * 10 | from .vlm import * 11 | from .config import * 12 | from .tools import cli 13 | 14 | load_env() 15 | -------------------------------------------------------------------------------- /vlmeval/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import OpenAIWrapper, GPT4V 2 | from .gpt_int import OpenAIWrapperInternal, GPT4V_Internal 3 | from .hf_chat_model import HFChatModel 4 | from .gemini import GeminiWrapper, GeminiProVision 5 | from .qwen_vl_api import QwenVLWrapper, QwenVLAPI 6 | from .qwen_api import QwenAPI 7 | from .stepai import Step1V_INT 8 | from .claude import Claude_Wrapper, Claude3V 9 | from .reka import Reka 10 | from .glm_vision import GLMVisionAPI 11 | from .cloudwalk import CWWrapper 12 | 13 | __all__ = [ 14 | 'OpenAIWrapper', 'HFChatModel', 'OpenAIWrapperInternal', 'GeminiWrapper', 15 | 'GPT4V', 'GPT4V_Internal', 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI', 16 | 'QwenAPI', 'Claude3V', 'Claude_Wrapper', 'Reka', 'Step1V_INT', 'GLMVisionAPI', 17 | 'CWWrapper' 18 | ] 19 | -------------------------------------------------------------------------------- /vlmeval/api/claude.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import base64 5 | import mimetypes 6 | 7 | url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat' 8 | headers = { 9 | 'alles-apin-token': '', 10 | 'Content-Type': 'application/json' 11 | } 12 | 13 | 14 | class Claude_Wrapper(BaseAPI): 15 | 16 | is_api: bool = True 17 | 18 | def __init__(self, 19 | model: str = 'claude-3-opus-20240229', 20 | key: str = None, 21 | retry: int = 10, 22 | wait: int = 3, 23 | system_prompt: str = None, 24 | verbose: bool = True, 25 | temperature: float = 0, 26 | max_tokens: int = 1024, 27 | **kwargs): 28 | 29 | self.model = model 30 | self.headers = headers 31 | self.temperature = temperature 32 | self.max_tokens = max_tokens 33 | if key is not None: 34 | self.key = key 35 | else: 36 | self.key = os.environ.get('ALLES', '') 37 | self.headers['alles-apin-token'] = self.key 38 | 39 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 40 | 41 | def build_msgs(self, msgs_raw): 42 | 43 | messages = [] 44 | message = {'role': 'user', 'content': []} 45 | for msg in msgs_raw: 46 | if msg['type'] == 'image': 47 | pth = msg['value'] 48 | suffix = osp.splitext(pth)[-1].lower() 49 | media_type = mimetypes.types_map.get(suffix, None) 50 | assert media_type is not None 51 | 52 | item = { 53 | 'type': 'image', 54 | 'source': {'type': 'base64', 'media_type': media_type, 'data': encode_image_file_to_base64(pth)} 55 | } 56 | 57 | elif msg['type'] == 'text': 58 | item = {'type': 'text', 'text': msg['value']} 59 | else: 60 | raise NotImplementedError(f'Unsupported message type: {msg["type"]}') 61 | 62 | message['content'].append(item) 63 | messages.append(message) 64 | return messages 65 | 66 | def generate_inner(self, inputs, **kwargs) -> str: 67 | 68 | payload = json.dumps({ 69 | 'model': self.model, 70 | 'max_tokens': self.max_tokens, 71 | 'messages': self.build_msgs(msgs_raw=inputs), 72 | **kwargs 73 | }) 74 | response = requests.request('POST', 
url, headers=headers, data=payload) 75 | 76 | ret_code = response.status_code 77 | retry = self.retry 78 | while ret_code == 429 and retry > 0: 79 | sleep(15) 80 | response = requests.request('POST', url, headers=headers, data=payload) 81 | ret_code = response.status_code 82 | retry -= 1 83 | 84 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 85 | answer = self.fail_msg 86 | 87 | try: 88 | resp_struct = json.loads(response.text) 89 | answer = resp_struct['data']['content'][0]['text'].strip() 90 | except: 91 | pass 92 | 93 | return ret_code, answer, response 94 | 95 | 96 | class Claude3V(Claude_Wrapper): 97 | 98 | def generate(self, message, dataset=None): 99 | return super(Claude_Wrapper, self).generate(message) 100 | -------------------------------------------------------------------------------- /vlmeval/api/cloudwalk.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | import os 3 | from .base import BaseAPI 4 | 5 | 6 | class CWWrapper(BaseAPI): 7 | 8 | is_api: bool = True 9 | 10 | def __init__(self, 11 | model: str = 'cw-congrong-v1.5', 12 | retry: int = 10, 13 | wait: int = 5, 14 | key: str = None, 15 | verbose: bool = True, 16 | system_prompt: str = None, 17 | temperature: float = 0, 18 | timeout: int = 600, 19 | api_base: str = 'http://cwapi-vlm01.cw_rb.azurebot.tk/v1/chat/completions', 20 | max_tokens: int = 1024, 21 | img_size: int = 512, 22 | img_detail: str = 'low', 23 | **kwargs): 24 | 25 | self.model = model 26 | self.cur_idx = 0 27 | self.fail_msg = 'Failed to obtain answer via API. ' 28 | self.max_tokens = max_tokens 29 | self.temperature = temperature 30 | 31 | base = os.environ.get('CW_API_BASE', None) 32 | self.api_base = base if base is not None else api_base 33 | 34 | env_key = os.environ.get('CW_API_KEY', None) 35 | self.key = env_key if env_key is not None else key 36 | assert self.key is not None, 'API key not provided. Please set CW_API_KEY environment variable or \ 37 | pass it to the constructor.' 38 | 39 | assert img_size > 0 or img_size == -1 40 | self.img_size = -1 # always send full size image 41 | assert img_detail in ['high', 'low'] 42 | self.img_detail = img_detail 43 | 44 | self.vision = True 45 | self.timeout = timeout 46 | 47 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | # inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
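# (illustrative example, not from the original source; the file path is hypothetical) a typical `inputs` value for prepare_inputs is:
# inputs = [dict(type='image', value='/path/to/apple.jpg'), dict(type='text', value='What fruit is shown in the image?')]
# prepare_inputs below turns such a list into a single OpenAI-style user message whose content mixes image_url and text items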
50 | # content can be a string or a list of image & text 51 | def prepare_inputs(self, inputs): 52 | input_msgs = [] 53 | if self.system_prompt is not None: 54 | input_msgs.append(dict(role='system', content=self.system_prompt)) 55 | has_images = np.sum([x['type'] == 'image' for x in inputs]) 56 | if has_images: 57 | content_list = [] 58 | for msg in inputs: 59 | if msg['type'] == 'text': 60 | content_list.append(dict(type='text', text=msg['value'])) 61 | elif msg['type'] == 'image': 62 | from PIL import Image 63 | img = Image.open(msg['value']) 64 | b64 = encode_image_to_base64(img, target_size=self.img_size) 65 | img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail) 66 | content_list.append(dict(type='image_url', image_url=img_struct)) 67 | input_msgs.append(dict(role='user', content=content_list)) 68 | else: 69 | assert all([x['type'] == 'text' for x in inputs]) 70 | text = '\n'.join([x['value'] for x in inputs]) 71 | input_msgs.append(dict(role='user', content=text)) 72 | return input_msgs 73 | 74 | def generate_inner(self, inputs, **kwargs) -> str: 75 | input_msgs = self.prepare_inputs(inputs) 76 | temperature = kwargs.pop('temperature', self.temperature) 77 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 78 | 79 | if 0 < max_tokens <= 100: 80 | self.logger.warning( 81 | 'Less than 100 tokens left, ' 82 | 'may exceed the context window with some additional meta symbols. ' 83 | ) 84 | if max_tokens <= 0: 85 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' 86 | 87 | headers = {'Content-Type': 'application/json', 'Authorization': f'{self.key}'} 88 | payload = dict( 89 | model=self.model, 90 | messages=input_msgs, 91 | max_tokens=max_tokens, 92 | n=1, 93 | temperature=temperature, 94 | **kwargs) 95 | response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 96 | ret_code = response.status_code 97 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 98 | answer = self.fail_msg 99 | try: 100 | resp_struct = json.loads(response.text) 101 | answer = resp_struct['choices'][0]['message']['content'].strip() 102 | except: 103 | pass 104 | return ret_code, answer, response 105 | -------------------------------------------------------------------------------- /vlmeval/api/glm_vision.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from vlmeval.utils.dataset import DATASET_TYPE 4 | from vlmeval.smp.vlm import encode_image_file_to_base64 5 | 6 | 7 | class GLMVisionWrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | 11 | def __init__(self, 12 | model: str, 13 | retry: int = 5, 14 | wait: int = 5, 15 | key: str = None, 16 | verbose: bool = True, 17 | system_prompt: str = None, 18 | max_tokens: int = 1024, 19 | proxy: str = None, 20 | **kwargs): 21 | 22 | self.model = model 23 | self.fail_msg = 'Failed to obtain answer via API. 
' 24 | self.default_params = { 25 | 'top_p': 0.6, 26 | 'top_k': 2, 27 | 'temperature': 0.8, 28 | 'repetition_penalty': 1.1, 29 | 'best_of': 1, 30 | 'do_sample': True, 31 | 'stream': False, 32 | 'max_tokens': max_tokens 33 | } 34 | if key is None: 35 | key = os.environ.get('GLMV_API_KEY', None) 36 | assert key is not None, ( 37 | 'Please set the API Key (obtain it here: ' 38 | 'https://open.bigmodel.cn/dev/howuse/introduction)' 39 | ) 40 | self.key = key 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | def image_to_base64(self, image_path): 44 | import base64 45 | with open(image_path, 'rb') as image_file: 46 | encoded_string = base64.b64encode(image_file.read()) 47 | return encoded_string.decode('utf-8') 48 | 49 | def build_msgs(self, msgs_raw, system_prompt=None, dataset=None): 50 | msgs = cp.deepcopy(msgs_raw) 51 | content = [] 52 | text = '' 53 | for i, msg in enumerate(msgs): 54 | if msg['type'] == 'text': 55 | text += msg['value'] 56 | elif msg['type'] == 'image': 57 | content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value'])))) 58 | if dataset is not None and DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: 59 | text += '\nShort Answer.' 60 | content.append(dict(type='text', text=text)) 61 | ret = [dict(role='user', content=content)] 62 | return ret 63 | 64 | def generate_inner(self, inputs, **kwargs) -> str: 65 | assert isinstance(inputs, str) or isinstance(inputs, list) 66 | inputs = [inputs] if isinstance(inputs, str) else inputs 67 | 68 | messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None)) 69 | 70 | url = 'https://api.chatglm.cn/v1/chat/completions' 71 | headers = { 72 | 'Content-Type': 'application/json', 73 | 'Request-Id': 'remote-test', 74 | 'Authorization': f'Bearer {self.key}' 75 | } 76 | payload = { 77 | 'model': self.model, 78 | 'messages': messages, 79 | **self.default_params 80 | } 81 | response = requests.post(url, headers=headers, data=json.dumps(payload), verify=False) 82 | output = [] 83 | try: 84 | assert response.status_code == 200 85 | for line in response.iter_lines(): 86 | data = json.loads(line.decode('utf-8').lstrip('data: ')) 87 | output.append(data['choices'][0]['message']['content']) 88 | answer = ''.join(output).replace('', '') 89 | if self.verbose: 90 | self.logger.info(f'inputs: {inputs}\nanswer: {answer}') 91 | return 0, answer, 'Succeeded! 
' 92 | except Exception as err: 93 | if self.verbose: 94 | self.logger.error(err) 95 | self.logger.error(f'The input messages are {inputs}.') 96 | return -1, self.fail_msg, '' 97 | 98 | 99 | class GLMVisionAPI(GLMVisionWrapper): 100 | 101 | def generate(self, message, dataset=None): 102 | return super(GLMVisionAPI, self).generate(message, dataset=dataset) 103 | -------------------------------------------------------------------------------- /vlmeval/api/gpt_int.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | import requests 4 | from ..smp import * 5 | from .gpt import GPT_context_window, OpenAIWrapper 6 | 7 | url = 'http://ecs.sv.us.alles-apin.openxlab.org.cn/v1/openai/v2/text/chat' 8 | headers = { 9 | 'Content-Type': 'application/json' 10 | } 11 | 12 | 13 | class OpenAIWrapperInternal(OpenAIWrapper): 14 | 15 | is_api: bool = True 16 | 17 | def __init__(self, 18 | model: str = 'gpt-3.5-turbo-0125', 19 | retry: int = 5, 20 | wait: int = 3, 21 | verbose: bool = True, 22 | system_prompt: str = None, 23 | temperature: float = 0, 24 | timeout: int = 60, 25 | max_tokens: int = 1024, 26 | img_size: int = 512, 27 | img_detail: str = 'low', 28 | **kwargs): 29 | 30 | self.model = model 31 | if 'KEYS' in os.environ and osp.exists(os.environ['KEYS']): 32 | keys = load(os.environ['KEYS']) 33 | headers['alles-apin-token'] = keys.get('alles-apin-token', '') 34 | elif 'ALLES' in os.environ: 35 | headers['alles-apin-token'] = os.environ['ALLES'] 36 | self.headers = headers 37 | self.temperature = temperature 38 | self.timeout = timeout 39 | self.max_tokens = max_tokens 40 | 41 | assert img_size > 0 or img_size == -1 42 | self.img_size = img_size 43 | assert img_detail in ['high', 'low'] 44 | self.img_detail = img_detail 45 | 46 | super(OpenAIWrapper, self).__init__( 47 | wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | def generate_inner(self, inputs, **kwargs) -> str: 50 | input_msgs = self.prepare_inputs(inputs) 51 | 52 | temperature = kwargs.pop('temperature', self.temperature) 53 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 54 | 55 | # Held out 100 tokens as buffer 56 | context_window = GPT_context_window(self.model) 57 | max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) 58 | if 0 < max_tokens <= 100: 59 | print('Less than 100 tokens left, may exceed the context window with some additional meta symbols. ') 60 | if max_tokens <= 0: 61 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. 
' 62 | 63 | payload = dict( 64 | model=self.model, 65 | messages=input_msgs, 66 | max_tokens=max_tokens, 67 | n=1, 68 | stop=None, 69 | timeout=self.timeout, 70 | temperature=temperature, 71 | **kwargs) 72 | 73 | response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 74 | ret_code = response.status_code 75 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 76 | 77 | answer = self.fail_msg 78 | try: 79 | resp_struct = json.loads(response.text) 80 | assert resp_struct['msg'] == 'ok' and resp_struct['msgCode'] == '10000', resp_struct 81 | answer = resp_struct['data']['choices'][0]['message']['content'].strip() 82 | except: 83 | pass 84 | return ret_code, answer, response 85 | 86 | 87 | class GPT4V_Internal(OpenAIWrapperInternal): 88 | 89 | def generate(self, message, dataset=None): 90 | return super(GPT4V_Internal, self).generate(message) 91 | -------------------------------------------------------------------------------- /vlmeval/api/qwen_api.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | import os 3 | from vlmeval.api.base import BaseAPI 4 | from vlmeval.smp import * 5 | 6 | 7 | # Note: This is a pure language model API. 8 | class QwenAPI(BaseAPI): 9 | 10 | is_api: bool = True 11 | 12 | def __init__(self, 13 | model: str = 'qwen-max-1201', 14 | retry: int = 5, 15 | wait: int = 5, 16 | verbose: bool = True, 17 | seed: int = 2680, 18 | temperature: float = 0.0, 19 | system_prompt: str = None, 20 | key: str = None, 21 | max_tokens: int = 1024, 22 | proxy: str = None, 23 | **kwargs): 24 | 25 | assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext'] 26 | self.model = model 27 | import dashscope 28 | self.fail_msg = 'Failed to obtain answer via API. ' 29 | self.max_tokens = max_tokens 30 | self.temperature = temperature 31 | self.seed = seed 32 | if key is None: 33 | key = os.environ.get('DASHSCOPE_API_KEY', None) 34 | assert key is not None, ( 35 | 'Please set the API Key (obtain it here: ' 36 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 37 | ) 38 | dashscope.api_key = key 39 | if proxy is not None: 40 | proxy_set(proxy) 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | @staticmethod 44 | def build_msgs(msgs_raw, system_prompt=None): 45 | msgs = cp.deepcopy(msgs_raw) 46 | ret = [] 47 | if system_prompt is not None: 48 | ret.append(dict(role='system', content=system_prompt)) 49 | for i, msg in enumerate(msgs): 50 | role = 'user' if i % 2 == 0 else 'assistant' 51 | ret.append(dict(role=role, content=msg)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | inputs = [inputs] if isinstance(inputs, str) else inputs 58 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 59 | 60 | import dashscope 61 | response = dashscope.Generation.call( 62 | model=self.model, 63 | messages=messages, 64 | seed=self.seed, 65 | temperature=self.temperature, 66 | max_tokens=self.max_tokens, 67 | result_format='message', # set the result to be "message" format. 68 | ) 69 | if response.status_code != HTTPStatus.OK: 70 | return -1, 'Error: Bad Response Status Code. ', f'The response status code is {response.status_code}.
' 71 | 72 | try: 73 | return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! ' 74 | except Exception as err: 75 | return -1, f'Error: Failed to parse the response. {err}', response 76 | -------------------------------------------------------------------------------- /vlmeval/api/qwen_vl_api.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | 5 | class QwenVLWrapper(BaseAPI): 6 | 7 | is_api: bool = True 8 | 9 | def __init__(self, 10 | model: str = 'qwen-vl-plus', 11 | retry: int = 5, 12 | wait: int = 5, 13 | key: str = None, 14 | verbose: bool = True, 15 | temperature: float = 0.0, 16 | system_prompt: str = None, 17 | max_tokens: int = 1024, 18 | proxy: str = None, 19 | **kwargs): 20 | 21 | assert model in ['qwen-vl-plus', 'qwen-vl-max'] 22 | self.model = model 23 | import dashscope 24 | self.fail_msg = 'Failed to obtain answer via API. ' 25 | self.max_tokens = max_tokens 26 | self.temperature = temperature 27 | if key is None: 28 | key = os.environ.get('DASHSCOPE_API_KEY', None) 29 | assert key is not None, ( 30 | 'Please set the API Key (obtain it here: ' 31 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 32 | ) 33 | dashscope.api_key = key 34 | if proxy is not None: 35 | proxy_set(proxy) 36 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 37 | 38 | @staticmethod 39 | def build_msgs(msgs_raw, system_prompt=None): 40 | msgs = cp.deepcopy(msgs_raw) 41 | ret = [] 42 | if system_prompt is not None: 43 | content = [dict(text=system_prompt)]  # wrap the dict in a list; list(dict(...)) would only keep the keys 44 | ret.append(dict(role='system', content=content)) 45 | content = [] 46 | for msg in msgs: 47 | if msg['type'] == 'text': 48 | content.append(dict(text=msg['value'])) 49 | elif msg['type'] == 'image': 50 | content.append(dict(image='file://' + msg['value'])) 51 | ret.append(dict(role='user', content=content)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | pure_text = np.all([x['type'] == 'text' for x in inputs]) 58 | assert not pure_text 59 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 60 | gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature) 61 | gen_config.update(kwargs) 62 | try: 63 | response = MultiModalConversation.call(model=self.model, messages=messages) 64 | if self.verbose: 65 | print(response) 66 | answer = response.output.choices[0]['message']['content'][0]['text'] 67 | return 0, answer, 'Succeeded!
' 68 | except Exception as err: 69 | if self.verbose: 70 | self.logger.error(err) 71 | self.logger.error(f'The input messages are {inputs}.') 72 | 73 | return -1, '', '' 74 | 75 | 76 | class QwenVLAPI(QwenVLWrapper): 77 | 78 | def generate(self, message, dataset=None): 79 | return super(QwenVLAPI, self).generate(message) 80 | -------------------------------------------------------------------------------- /vlmeval/api/reka.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import mimetypes 5 | 6 | 7 | class Reka_Wrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | INTERLEAVE: bool = False 11 | 12 | def __init__(self, 13 | model: str = 'reka-flash-20240226', 14 | key: str = None, 15 | retry: int = 10, 16 | wait: int = 3, 17 | system_prompt: str = None, 18 | verbose: bool = True, 19 | temperature: float = 0, 20 | max_tokens: int = 1024, 21 | **kwargs): 22 | 23 | try: 24 | import reka 25 | except ImportError: 26 | raise ImportError('Please install reka by running "pip install reka-api"') 27 | 28 | self.model = model 29 | default_kwargs = dict(temperature=temperature, request_output_len=max_tokens) 30 | default_kwargs.update(kwargs) 31 | self.kwargs = default_kwargs 32 | if key is not None: 33 | self.key = key 34 | else: 35 | self.key = os.environ.get('REKA_API_KEY', '') 36 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 37 | 38 | def generate_inner(self, inputs, **kwargs) -> str: 39 | import reka 40 | reka.API_KEY = self.key 41 | prompt, image_path = self.message_to_promptimg(inputs) 42 | image_b64 = encode_image_file_to_base64(image_path) 43 | 44 | response = reka.chat( 45 | model_name=self.model, 46 | human=prompt, 47 | media_url=f'data:image/jpeg;base64,{image_b64}', 48 | **self.kwargs) 49 | 50 | try: 51 | return 0, response['text'], response 52 | except: 53 | return -1, self.fail_msg, response 54 | 55 | 56 | class Reka(Reka_Wrapper): 57 | 58 | def generate(self, message, dataset=None): 59 | return super(Reka_Wrapper, self).generate(message) 60 | -------------------------------------------------------------------------------- /vlmeval/api/stepai.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | url = 'https://api.stepfun.com/v1/chat/completions' 5 | headers = { 6 | 'Content-Type': 'application/json', 7 | 'Authorization': 'Bearer {}', 8 | } 9 | 10 | 11 | class StepAPI_INT(BaseAPI): 12 | 13 | is_api: bool = True 14 | 15 | def __init__(self, 16 | model: str = 'step-1v-8k', 17 | retry: int = 10, 18 | wait: int = 3, 19 | key: str = None, 20 | temperature: float = 0, 21 | max_tokens: int = 300, 22 | verbose: bool = True, 23 | system_prompt: str = None, 24 | **kwargs): 25 | self.model = model 26 | self.fail_msg = 'Fail to obtain answer via API.' 
27 | self.headers = headers 28 | self.temperature = temperature 29 | self.max_tokens = max_tokens 30 | self.system_prompt = system_prompt 31 | if key is not None: 32 | self.key = key 33 | else: 34 | self.key = os.environ.get('STEPAI_API_KEY', '') 35 | headers['Authorization'] = headers['Authorization'].format(self.key) 36 | 37 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 38 | 39 | @staticmethod 40 | def build_msgs(msgs_raw): 41 | messages = [] 42 | message = {'role': 'user', 'content': []} 43 | 44 | for msg in msgs_raw: 45 | if msg['type'] == 'image': 46 | image_b64 = encode_image_file_to_base64(msg['value']) 47 | message['content'].append({ 48 | 'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)}, 49 | 'type': 'image_url' 50 | }) 51 | elif msg['type'] == 'text': 52 | message['content'].append({ 53 | 'text': msg['value'], 54 | 'type': 'text' 55 | }) 56 | 57 | messages.append(message) 58 | return messages 59 | 60 | def generate_inner(self, inputs, **kwargs) -> str: 61 | print(inputs, '\n') 62 | payload = dict( 63 | model=self.model, 64 | max_tokens=self.max_tokens, 65 | temperature=self.temperature, 66 | messages=self.build_msgs(msgs_raw=inputs), 67 | **kwargs) 68 | response = requests.post(url, headers=headers, data=json.dumps(payload)) 69 | ret_code = response.status_code 70 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 71 | 72 | answer = self.fail_msg 73 | try: 74 | resp_struct = json.loads(response.text) 75 | answer = resp_struct['choices'][0]['message']['content'].strip() 76 | except: 77 | pass 78 | return ret_code, answer, response 79 | 80 | 81 | class Step1V_INT(StepAPI_INT): 82 | 83 | def generate(self, message, dataset=None): 84 | return super(StepAPI_INT, self).generate(message) 85 | -------------------------------------------------------------------------------- /vlmeval/evaluate/OCRBench.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | 3 | 4 | def OCRBench_eval(eval_file): 5 | OCRBench_score = { 6 | 'Regular Text Recognition': 0, 7 | 'Irregular Text Recognition': 0, 8 | 'Artistic Text Recognition': 0, 9 | 'Handwriting Recognition': 0, 10 | 'Digit String Recognition': 0, 11 | 'Non-Semantic Text Recognition': 0, 12 | 'Scene Text-centric VQA': 0, 13 | 'Doc-oriented VQA': 0, 14 | 'Key Information Extraction': 0, 15 | 'Handwritten Mathematical Expression Recognition': 0 16 | } 17 | 18 | logger = get_logger('Evaluation') 19 | 20 | data = load(eval_file) 21 | lt = len(data) 22 | lines = [data.iloc[i] for i in range(lt)] 23 | for i in tqdm(range(len(lines))): 24 | line = lines[i] 25 | predict = str(line['prediction']) 26 | answers = eval(line['answer']) 27 | category = line['category'] 28 | if category == 'Handwritten Mathematical Expression Recognition': 29 | for j in range(len(answers)): 30 | answer = answers[j].strip().replace('\n', ' ').replace(' ', '') 31 | predict = predict.strip().replace('\n', ' ').replace(' ', '') 32 | if answer in predict: 33 | OCRBench_score[category] += 1 34 | break 35 | else: 36 | for j in range(len(answers)): 37 | answer = answers[j].lower().strip().replace('\n', ' ') 38 | predict = predict.lower().strip().replace('\n', ' ') 39 | if answer in predict: 40 | OCRBench_score[category] += 1 41 | break 42 | 43 | final_score_dict = {} 44 | final_score_dict['Text Recognition'] = ( 45 | OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] 46 | + OCRBench_score['Artistic Text 
Recognition'] + OCRBench_score['Handwriting Recognition'] 47 | + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'] 48 | ) 49 | final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] 50 | final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] 51 | final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] 52 | final_score_dict['Handwritten Mathematical Expression Recognition'] = \ 53 | OCRBench_score['Handwritten Mathematical Expression Recognition'] 54 | final_score_dict['Final Score'] = ( 55 | final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] 56 | + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] 57 | + final_score_dict['Handwritten Mathematical Expression Recognition'] 58 | ) 59 | final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 60 | score_pth = eval_file.replace('.xlsx', '_score.json') 61 | dump(final_score_dict, score_pth) 62 | logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 63 | logger.info('Score: ') 64 | for key, value in final_score_dict.items(): 65 | logger.info('{}:{}'.format(key, value)) 66 | -------------------------------------------------------------------------------- /vlmeval/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .yes_or_no import default_rating, MME_rating, YOrN_eval 2 | from .mmvet_eval import MMVet_eval 3 | from .multiple_choice import multiple_choice_eval 4 | from .coco_eval import COCO_eval 5 | from .vqa_eval import VQAEval 6 | from .mathvista_eval import MathVista_eval 7 | from .llavabench import LLaVABench_eval 8 | from .misc import build_judge 9 | from .OCRBench import OCRBench_eval 10 | -------------------------------------------------------------------------------- /vlmeval/evaluate/coco_eval.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from pycocoevalcap.bleu.bleu import Bleu 3 | from pycocoevalcap.rouge.rouge import Rouge 4 | from pycocoevalcap.cider.cider import Cider 5 | 6 | 7 | class COCO_Caption_Scorer(): 8 | def __init__(self, ref, gt): 9 | self.ref = ref 10 | self.gt = gt 11 | print('setting up scorers...') 12 | self.scorers = [ 13 | (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), 14 | # (Meteor(), "METEOR"), # need java version 11.0.16+ 15 | (Rouge(), 'ROUGE_L'), 16 | (Cider(), 'CIDEr'), 17 | # (Spice(), "SPICE"), # need java version 11.0.16+ 18 | ] 19 | 20 | def compute_scores(self): 21 | total_scores = {} 22 | for scorer, method in self.scorers: 23 | print('computing %s score...' 
% (scorer.method())) 24 | score, scores = scorer.compute_score(self.gt, self.ref) 25 | if type(method) == list: 26 | for sc, scs, m in zip(score, scores, method): 27 | print('%s: %0.3f' % (m, sc * 100)) 28 | total_scores['Bleu'] = [x * 100 for x in score] 29 | else: 30 | print('%s: %0.3f' % (method, score * 100)) 31 | total_scores[method] = score * 100 32 | 33 | print('*****DONE*****') 34 | for key, value in total_scores.items(): 35 | print('{}:{}'.format(key, value)) 36 | return total_scores 37 | 38 | 39 | def COCO_eval(eval_file, nproc=4, verbose=False): 40 | logger = get_logger('Evaluation') 41 | 42 | data = load(eval_file) 43 | 44 | lt = len(data) 45 | lines = [data.iloc[i] for i in range(lt)] 46 | ref = {} 47 | gt = {} 48 | for i, line in enumerate(lines): 49 | ref[str(i)] = [str(line['prediction'])] 50 | gt[str(i)] = eval(line['answer']) 51 | 52 | scorer = COCO_Caption_Scorer(ref, gt) 53 | coco_caption_score_dict = scorer.compute_scores() 54 | 55 | score_pth = eval_file.replace('.xlsx', '_score.json') 56 | dump(coco_caption_score_dict, score_pth) 57 | logger.info(f'COCO_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 58 | logger.info('Score: ') 59 | for key, value in coco_caption_score_dict.items(): 60 | logger.info('{}:{}'.format(key, value)) 61 | 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser(description='Inference LLM Answers. ') 65 | parser.add_argument('--data', type=str, help='The question set for inference, in excel / tsv / json format. ') 66 | parser.add_argument('--nproc', type=int, default=4) 67 | parser.add_argument('--verbose', action='store_true') 68 | args = parser.parse_args() 69 | return args 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | COCO_eval(eval_file=args.data, nproc=args.nproc, verbose=args.verbose) 75 | -------------------------------------------------------------------------------- /vlmeval/evaluate/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | from vlmeval.api import OpenAIWrapper, OpenAIWrapperInternal 3 | from vlmeval.smp import load_env 4 | 5 | INTERNAL = os.environ.get('INTERNAL', 0) 6 | 7 | 8 | def build_judge(**kwargs): 9 | model = kwargs.pop('model', None) 10 | load_env() 11 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None) 12 | if LOCAL_LLM is None: 13 | model_map = { 14 | 'gpt-4-turbo': 'gpt-4-1106-preview', 15 | 'gpt-4-0613': 'gpt-4-0613', 16 | 'gpt-4-0125': 'gpt-4-0125-preview', 17 | 'gpt-4-0409': 'gpt-4-turbo-2024-04-09', 18 | 'chatgpt-1106': 'gpt-3.5-turbo-1106', 19 | 'chatgpt-0125': 'gpt-3.5-turbo-0125', 20 | } 21 | model_version = model_map[model] 22 | else: 23 | model_version = LOCAL_LLM 24 | if INTERNAL: 25 | model = OpenAIWrapperInternal(model_version, **kwargs) 26 | else: 27 | model = OpenAIWrapper(model_version, **kwargs) 28 | return model 29 | -------------------------------------------------------------------------------- /vlmeval/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .vlm import * 3 | from .misc import * 4 | from .log import * 5 | -------------------------------------------------------------------------------- /vlmeval/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_initialized = {} 4 | 5 | 6 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 7 | logger = logging.getLogger(name) 8 | if name in 
logger_initialized: 9 | return logger 10 | 11 | for logger_name in logger_initialized: 12 | if name.startswith(logger_name): 13 | return logger 14 | 15 | stream_handler = logging.StreamHandler() 16 | handlers = [stream_handler] 17 | 18 | try: 19 | import torch.distributed as dist 20 | if dist.is_available() and dist.is_initialized(): 21 | rank = dist.get_rank() 22 | else: 23 | rank = 0 24 | except ImportError: 25 | rank = 0 26 | 27 | if rank == 0 and log_file is not None: 28 | file_handler = logging.FileHandler(log_file, file_mode) 29 | handlers.append(file_handler) 30 | 31 | formatter = logging.Formatter( 32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 33 | for handler in handlers: 34 | handler.setFormatter(formatter) 35 | handler.setLevel(log_level) 36 | logger.addHandler(handler) 37 | 38 | if rank == 0: 39 | logger.setLevel(log_level) 40 | else: 41 | logger.setLevel(logging.ERROR) 42 | 43 | logger_initialized[name] = True 44 | return logger 45 | -------------------------------------------------------------------------------- /vlmeval/smp/vlm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import pandas as pd 4 | import numpy as np 5 | import string 6 | from uuid import uuid4 7 | import os.path as osp 8 | import base64 9 | from PIL import Image 10 | from .file import load, dump 11 | Image.MAX_IMAGE_PIXELS = 1e9 12 | 13 | 14 | def mmqa_display(question, target_size=512): 15 | question = {k.lower(): v for k, v in question.items()} 16 | keys = list(question.keys()) 17 | keys = [k for k in keys if k not in ['index', 'image']] 18 | 19 | images = question['image'] 20 | if isinstance(images, str): 21 | images = [images] 22 | 23 | idx = question.pop('index', 'XXX') 24 | print(f'INDEX: {idx}') 25 | 26 | for im in images: 27 | image = decode_base64_to_image(im, target_size=target_size) 28 | display(image) # noqa: F821 29 | 30 | for k in keys: 31 | try: 32 | if not pd.isna(question[k]): 33 | print(f'{k.upper()}. {question[k]}') 34 | except ValueError: 35 | if False in pd.isna(question[k]): 36 | print(f'{k.upper()}. 
{question[k]}') 37 | 38 | 39 | def encode_image_to_base64(img, target_size=-1): 40 | # if target_size == -1, will not do resizing 41 | # else, will set the max_size ot (target_size, target_size) 42 | if img.mode in ('RGBA', 'P'): 43 | img = img.convert('RGB') 44 | tmp = osp.join('/tmp', str(uuid4()) + '.jpg') 45 | if target_size > 0: 46 | img.thumbnail((target_size, target_size)) 47 | img.save(tmp) 48 | with open(tmp, 'rb') as image_file: 49 | image_data = image_file.read() 50 | ret = base64.b64encode(image_data).decode('utf-8') 51 | os.remove(tmp) 52 | return ret 53 | 54 | 55 | def encode_image_file_to_base64(image_path, target_size=-1): 56 | image = Image.open(image_path) 57 | return encode_image_to_base64(image, target_size=target_size) 58 | 59 | 60 | def decode_base64_to_image(base64_string, target_size=-1): 61 | image_data = base64.b64decode(base64_string) 62 | image = Image.open(io.BytesIO(image_data)) 63 | if image.mode in ('RGBA', 'P'): 64 | image = image.convert('RGB') 65 | if target_size > 0: 66 | image.thumbnail((target_size, target_size)) 67 | return image 68 | 69 | 70 | def decode_base64_to_image_file(base64_string, image_path, target_size=-1): 71 | image = decode_base64_to_image(base64_string, target_size=target_size) 72 | image.save(image_path) 73 | 74 | 75 | def build_option_str(option_dict): 76 | s = 'There are several options: \n' 77 | for c, content in option_dict.items(): 78 | if not pd.isna(content): 79 | s += f'{c}. {content}\n' 80 | return s 81 | 82 | 83 | def isimg(s): 84 | return osp.exists(s) or s.startswith('http') 85 | 86 | 87 | def read_ok(img_path): 88 | if not osp.exists(img_path): 89 | return False 90 | try: 91 | im = Image.open(img_path) 92 | assert im.size[0] > 0 and im.size[1] > 0 93 | return True 94 | except: 95 | return False 96 | 97 | 98 | def gpt_key_set(): 99 | openai_key = os.environ.get('OPENAI_API_KEY', None) 100 | return isinstance(openai_key, str) and openai_key.startswith('sk-') 101 | 102 | 103 | def apiok(wrapper): 104 | s = wrapper.generate('Hello!') 105 | return wrapper.fail_msg not in s 106 | 107 | 108 | def circular_pred(df, extract_func=None): 109 | if extract_func is None: 110 | extract_func = lambda x: x # noqa: E731 111 | df = df.sort_values('index') 112 | from vlmeval.utils import can_infer_option 113 | shift = int(1e6) 114 | 115 | choices = [extract_func(x) for x in df['prediction']] 116 | pred_map = {i: c for i, c in zip(df['index'], choices)} 117 | flag_map = {i: True for i in pred_map if i < 1e6} 118 | valid_map = {i: True for i in pred_map if i < 1e6} 119 | for i in df['index']: 120 | if i >= shift and pred_map[i] and pred_map[i - shift]: 121 | if ( 122 | pred_map[i] not in list(string.ascii_uppercase) or # noqa: W504 123 | pred_map[i - shift] not in list(string.ascii_uppercase) 124 | ): 125 | 126 | valid_map[i % shift] = False 127 | continue 128 | if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1: 129 | continue 130 | else: 131 | flag_map[i % shift] = False 132 | flag_map = {k: v for k, v in flag_map.items() if valid_map[k]} 133 | flags = list(flag_map.values()) 134 | return np.mean(flags) 135 | -------------------------------------------------------------------------------- /vlmeval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .matching_util import can_infer, can_infer_option, can_infer_text 2 | from .mp_util import track_progress_rich 3 | from .custom_prompt import CustomPrompt 4 | from .dataset_config import dataset_URLs, img_root_map, DATASET_TYPE, 
abbr2full 5 | from .dataset import TSVDataset, split_MMMU 6 | from .result_transfer import MMMU_result_transfer, MMTBench_result_transfer 7 | 8 | 9 | __all__ = [ 10 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 11 | 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt', 12 | 'split_MMMU', 'abbr2full', 'MMMU_result_transfer', 'MMTBench_result_transfer' 13 | ] 14 | -------------------------------------------------------------------------------- /vlmeval/utils/custom_prompt.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | from .dataset_config import img_root_map 3 | from abc import abstractmethod 4 | 5 | 6 | class CustomPrompt: 7 | 8 | @abstractmethod 9 | def use_custom_prompt(self, dataset): 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def build_prompt(self, line, dataset): 14 | raise NotImplementedError 15 | 16 | def dump_image(self, line, dataset): 17 | ROOT = LMUDataRoot() 18 | assert isinstance(dataset, str) 19 | img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) 20 | os.makedirs(img_root, exist_ok=True) 21 | 22 | if 'image' in line: 23 | if isinstance(line['image'], list): 24 | tgt_path = [] 25 | assert 'image_path' in line 26 | for img, im_name in zip(line['image'], line['image_path']): 27 | path = osp.join(img_root, im_name) 28 | if not read_ok(path): 29 | decode_base64_to_image_file(img, path) 30 | tgt_path.append(path) 31 | else: 32 | tgt_path = osp.join(img_root, f"{line['index']}.jpg") 33 | if not read_ok(tgt_path): 34 | decode_base64_to_image_file(line['image'], tgt_path) 35 | tgt_path = [tgt_path] 36 | else: 37 | assert 'image_path' in line 38 | tgt_path = toliststr(line['image_path']) 39 | 40 | return tgt_path 41 | -------------------------------------------------------------------------------- /vlmeval/utils/matching_util.py: -------------------------------------------------------------------------------- 1 | import string 2 | import copy as cp 3 | import os 4 | from ..smp import * 5 | 6 | 7 | def can_infer_option(answer, choices): 8 | verbose = os.environ.get('VERBOSE', 0) 9 | # Choices is a dictionary 10 | if 'Failed to obtain answer via API' in answer: 11 | return False 12 | 13 | reject_to_answer = [ 14 | "Sorry, I can't help with images of people yet.", 15 | "I can't process this file.", 16 | "I'm sorry, but without the image provided", 17 | 'Cannot determine the answer' 18 | ] 19 | for err in reject_to_answer: 20 | if err in answer: 21 | return 'Z' 22 | 23 | def count_choice(splits, choices, prefix='', suffix=''): 24 | cnt = 0 25 | for c in choices: 26 | if prefix + c + suffix in splits: 27 | cnt += 1 28 | return cnt 29 | 30 | answer_mod = cp.copy(answer) 31 | chars = '.()[],:;!*#{}' 32 | for c in chars: 33 | answer_mod = answer_mod.replace(c, ' ') 34 | 35 | splits = [x.strip() for x in answer_mod.split()] 36 | count = count_choice(splits, choices) 37 | 38 | if count == 1: 39 | for ch in choices: 40 | if 'A' in splits and len(splits) > 3 and verbose: 41 | logger = get_logger('Evaluation') 42 | logger.info(f'A might be a quantifier in the string: {answer}.') 43 | return False 44 | if ch in splits: 45 | return ch 46 | elif count == 0 and count_choice(splits, {'Z', ''}) == 1: 47 | return 'Z' 48 | return False 49 | 50 | 51 | def can_infer_text(answer, choices): 52 | answer = answer.lower() 53 | assert isinstance(choices, dict) 54 | for k in choices: 55 | assert k in string.ascii_uppercase 56 | 
choices[k] = str(choices[k]).lower() 57 | cands = [] 58 | for k in choices: 59 | if choices[k] in answer: 60 | cands.append(k) 61 | if len(cands) == 1: 62 | return cands[0] 63 | return False 64 | 65 | 66 | def can_infer(answer, choices): 67 | answer = str(answer) 68 | copt = can_infer_option(answer, choices) 69 | return copt if copt else can_infer_text(answer, choices) 70 | -------------------------------------------------------------------------------- /vlmeval/utils/result_transfer.py: -------------------------------------------------------------------------------- 1 | from ..evaluate.misc import build_judge 2 | from ..evaluate.multiple_choice import extract_answer_from_item 3 | 4 | from ..smp import * 5 | from .matching_util import can_infer 6 | from .mp_util import track_progress_rich 7 | 8 | 9 | def MMMU_result_transfer(result_path): 10 | res = {} 11 | result_data = load(result_path) 12 | mcq = result_data['A'].notna() 13 | lt = len(result_data) 14 | for i in range(lt): 15 | line = result_data.iloc[i] 16 | if mcq[i]: 17 | options = { 18 | cand: line[cand] 19 | for cand in string.ascii_uppercase 20 | if cand in line and not pd.isna(line[cand]) 21 | } 22 | prediction = line['prediction'] 23 | infer_prediction = can_infer(prediction, options) 24 | res[line['id']] = infer_prediction 25 | else: 26 | res[line['id']] = line['prediction'] 27 | result_json = result_path.replace('.xlsx', '.json') 28 | dump(res, result_json) 29 | return result_json 30 | 31 | 32 | def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): 33 | logger = get_logger('Evaluation') 34 | INTERNAL = os.environ.get('INTERNAL', 0) 35 | nproc = judge_kwargs.pop('nproc', 4) 36 | 37 | rd.seed(2680) 38 | suffix = eval_file.split('.')[-1] 39 | model = judge_kwargs['model'] 40 | assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] 41 | name_str_map = { 42 | 'chatgpt-0125': 'openai', 43 | 'gpt-4-0125': 'gpt4' 44 | } 45 | name_str = name_str_map[model] if model in name_str_map else model 46 | 47 | if model == 'exact_matching': 48 | model = None 49 | else: 50 | if INTERNAL or gpt_key_set(): 51 | model = build_judge(**judge_kwargs) 52 | else: 53 | logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') 54 | model = None 55 | 56 | logger.info(f'Evaluating {eval_file}') 57 | result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') 58 | result = {} 59 | if osp.exists(result_file): 60 | result = load(result_file) 61 | 62 | data = load(eval_file) 63 | assert 'index' in data, 'Essential columns missing in the eval_file.'
64 | 65 | data = data.sort_values(by='index') 66 | data['prediction'] = [str(x) for x in data['prediction']] 67 | for k in data.keys(): 68 | data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) 69 | 70 | idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))} 71 | idx2lines = {k: v for k, v in idx2lines.items() if k not in result} 72 | 73 | indices = list(idx2lines.keys()) 74 | lines = [idx2lines[i] for i in indices] 75 | tups = [(model, line) for line in lines] 76 | res = track_progress_rich( 77 | extract_answer_from_item, 78 | tups, 79 | nproc=nproc, 80 | chunksize=nproc, 81 | save=result_file, 82 | keys=indices) 83 | 84 | for i, r in zip(indices, res): 85 | if i in result: 86 | assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log'] 87 | else: 88 | result[i] = r 89 | 90 | indices = list(data['index']) 91 | data['opt'] = [result[i]['opt'] for i in data['index']] 92 | data['log'] = [result[i]['log'] for i in data['index']] 93 | 94 | # load split 95 | output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') 96 | dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) 97 | return output_path 98 | -------------------------------------------------------------------------------- /vlmeval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.set_grad_enabled(False) 4 | torch.manual_seed(1234) 5 | from .base import BaseModel 6 | from .cogvlm import CogVlm, GLM4v 7 | from .emu import Emu 8 | from .idefics import IDEFICS, IDEFICS2 9 | from .instructblip import InstructBLIP 10 | from .llava import LLaVA, LLaVA_Next, LLaVA_XTuner 11 | from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V 12 | from .minigpt4 import MiniGPT4 13 | from .mmalaya import MMAlaya 14 | from .monkey import Monkey, MonkeyChat 15 | from .mplug_owl2 import mPLUG_Owl2 16 | from .omnilmm import OmniLMM12B 17 | from .open_flamingo import OpenFlamingo 18 | from .pandagpt import PandaGPT 19 | from .qwen_vl import QwenVL, QwenVLChat 20 | from .transcore_m import TransCoreM 21 | from .visualglm import VisualGLM 22 | from .xcomposer import ShareCaptioner, XComposer, XComposer2, XComposer2_4KHD 23 | from .yi_vl import Yi_VL 24 | from .internvl_chat import InternVLChat 25 | from .deepseek_vl import DeepSeekVL 26 | from .mgm import Mini_Gemini 27 | from .bunnyllama3 import BunnyLLama3 28 | from .vxverse import VXVERSE 29 | from .paligemma import PaliGemma 30 | from .qh_360vl import QH_360VL 31 | from .phi3_vision import Phi3Vision 32 | from .wemm import WeMM 33 | -------------------------------------------------------------------------------- /vlmeval/vlm/bunnyllama3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from PIL import Image 5 | import warnings 6 | 7 | from .base import BaseModel 8 | from ..smp import * 9 | from ..utils import DATASET_TYPE 10 | 11 | 12 | class BunnyLLama3(BaseModel): 13 | 14 | INSTALL_REQ = False 15 | INTERLEAVE = False 16 | 17 | def __init__(self, model_path='BAAI/Bunny-Llama-3-8B-V', **kwargs): 18 | assert model_path is not None 19 | transformers.logging.set_verbosity_error() 20 | transformers.logging.disable_progress_bar() 21 | warnings.filterwarnings('ignore') 22 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 23 | self.model = 
AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True) 24 | self.kwargs = kwargs 25 | 26 | def generate_inner(self, message, dataset=None): 27 | prompt, image_path = self.message_to_promptimg(message) 28 | text = f"A chat between a curious user and an artificial intelligence assistant. \ 29 | The assistant gives helpful, detailed, and polite answers to the user's questions. \ 30 | USER: <image>\n{prompt} ASSISTANT:" 31 | text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')] 32 | input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0) 33 | image = Image.open(image_path).convert('RGB') 34 | image_tensor = self.model.process_images([image], self.model.config).to(dtype=self.model.dtype) 35 | 36 | output_ids = self.model.generate(input_ids, images=image_tensor, max_new_tokens=100, use_cache=True)[0] 37 | response = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True) 38 | return response 39 | -------------------------------------------------------------------------------- /vlmeval/vlm/deepseek_vl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from transformers import AutoModelForCausalLM 4 | import warnings 5 | from .base import BaseModel 6 | 7 | 8 | class DeepSeekVL(BaseModel): 9 | 10 | INSTALL_REQ = True 11 | INTERLEAVE = True 12 | 13 | def check_install(self): 14 | try: 15 | import deepseek_vl 16 | except ImportError: 17 | warnings.warn( 18 | 'Please first install deepseek_vl from source codes in: https://github.com/deepseek-ai/DeepSeek-VL') 19 | sys.exit(-1) 20 | 21 | def __init__(self, model_path='deepseek-ai/deepseek-vl-1.3b-chat', **kwargs): 22 | self.check_install() 23 | assert model_path is not None 24 | self.model_path = model_path 25 | from deepseek_vl.models import VLChatProcessor 26 | 27 | self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path) 28 | self.tokenizer = self.vl_chat_processor.tokenizer 29 | 30 | model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) 31 | self.model = model.to(torch.bfloat16).cuda().eval() 32 | 33 | torch.cuda.empty_cache() 34 | default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True) 35 | default_kwargs.update(kwargs) 36 | self.kwargs = default_kwargs 37 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config.
') 38 | 39 | def prepare_inputs(self, message): 40 | content, images = '', [] 41 | for s in message: 42 | if s['type'] == 'image': 43 | images.append(s['value']) 44 | content += '<image_placeholder>' 45 | elif s['type'] == 'text': 46 | content += s['value'] 47 | conversation = [ 48 | dict(role='User', content=content, images=images), 49 | dict(role='Assistant', content='') 50 | ] 51 | return conversation 52 | 53 | def generate_inner(self, message, dataset=None): 54 | conversation = self.prepare_inputs(message) 55 | from deepseek_vl.utils.io import load_pil_images 56 | pil_images = load_pil_images(conversation) 57 | prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True) 58 | prepare_inputs = prepare_inputs.to(self.model.device) 59 | inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs) 60 | 61 | outputs = self.model.language_model.generate( 62 | inputs_embeds=inputs_embeds, 63 | attention_mask=prepare_inputs.attention_mask, 64 | pad_token_id=self.tokenizer.eos_token_id, 65 | bos_token_id=self.tokenizer.bos_token_id, 66 | eos_token_id=self.tokenizer.eos_token_id, 67 | **self.kwargs) 68 | answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True) 69 | return answer 70 | -------------------------------------------------------------------------------- /vlmeval/vlm/emu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from PIL import Image 4 | import os.path as osp 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class Emu(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = True 13 | 14 | def __init__(self, 15 | model_path='BAAI/Emu2-Chat', 16 | **kwargs): 17 | 18 | self.model_path = model_path 19 | assert osp.exists(model_path) or splitlen(model_path) == 2 20 | 21 | from transformers import AutoModelForCausalLM, AutoTokenizer 22 | from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model 23 | 24 | local_rank = os.environ.get('LOCAL_RANK', 0) 25 | 26 | device_num = torch.cuda.device_count() 27 | assert local_rank * 2 <= device_num, 'The number of devices does not match the world size' 28 | assert device_num >= 2, 'You need at least 2 GPUs to use EMU' 29 | 30 | device_1 = local_rank 31 | device_2 = local_rank + device_num // 2 32 | 33 | torch.cuda.set_device(device_1) 34 | torch.cuda.set_device(device_2) 35 | 36 | tokenizer = AutoTokenizer.from_pretrained(model_path) # "BAAI/Emu2-Chat" 37 | self.tokenizer = tokenizer 38 | with init_empty_weights(): 39 | model = AutoModelForCausalLM.from_pretrained( 40 | model_path, # "BAAI/Emu2-Chat" 41 | torch_dtype=torch.bfloat16, 42 | low_cpu_mem_usage=True, 43 | trust_remote_code=True) 44 | 45 | device_map = infer_auto_device_map( 46 | model, 47 | max_memory={ 48 | device_1: '38GiB', 49 | device_2: '38GiB' 50 | }, 51 | no_split_module_classes=['Block', 'LlamaDecoderLayer']) 52 | 53 | # input and output logits should be on same device 54 | device_map['model.decoder.lm.lm_head'] = device_1 55 | 56 | model = dispatch_model( 57 | model, 58 | device_map=device_map).eval() 59 | 60 | self.model = model 61 | kwargs_default = dict(max_new_tokens=512, length_penalty=-1) 62 | kwargs_default.update(kwargs) 63 | self.kwargs = kwargs_default 64 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config.
') 65 | 66 | def generate_inner(self, message, dataset=None): 67 | query, images = '', [] 68 | for item in message: 69 | if item['type'] == 'image': 70 | images.append(Image.open(item['value']).convert('RGB')) 71 | query += '[]' 72 | elif item['type'] == 'text': 73 | query += item['value'] 74 | 75 | inputs = self.model.build_input_ids( 76 | text=[query], 77 | tokenizer=self.tokenizer, 78 | image=images 79 | ) 80 | 81 | with torch.no_grad(): 82 | outputs = self.model.generate( 83 | input_ids=inputs['input_ids'], 84 | attention_mask=inputs['attention_mask'], 85 | image=inputs['image'].to(torch.bfloat16), 86 | **self.kwargs) 87 | 88 | output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) 89 | return output_text[0] 90 | -------------------------------------------------------------------------------- /vlmeval/vlm/instructblip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import os.path as osp 4 | import sys 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class InstructBLIP(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, name): 15 | self.config_map = { 16 | 'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml', 17 | 'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml', 18 | } 19 | 20 | self.file_path = __file__ 21 | config_root = osp.dirname(self.file_path) 22 | 23 | try: 24 | from lavis.models import load_preprocess 25 | from omegaconf import OmegaConf 26 | from lavis.common.registry import registry 27 | except: 28 | warnings.warn('Please install lavis before using InstructBLIP. ') 29 | sys.exit(-1) 30 | 31 | assert name in self.config_map 32 | cfg_path = osp.join(config_root, self.config_map[name]) 33 | cfg = OmegaConf.load(cfg_path) 34 | 35 | model_cfg = cfg.model 36 | assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2 37 | model_cls = registry.get_model_class(name='blip2_vicuna_instruct') 38 | model = model_cls.from_config(model_cfg) 39 | model.eval() 40 | 41 | self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu' 42 | device = self.device 43 | model.to(device) 44 | self.model = model 45 | self.kwargs = {'max_length': 512} 46 | 47 | preprocess_cfg = cfg.preprocess 48 | vis_processors, _ = load_preprocess(preprocess_cfg) 49 | self.vis_processors = vis_processors 50 | 51 | def generate_inner(self, message, dataset=None): 52 | prompt, image_path = self.message_to_promptimg(message) 53 | vis_processors = self.vis_processors 54 | raw_image = Image.open(image_path).convert('RGB') 55 | image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device) 56 | outputs = self.model.generate(dict(image=image_tensor, prompt=prompt)) 57 | return outputs[0] 58 | -------------------------------------------------------------------------------- /vlmeval/vlm/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava import LLaVA, LLaVA_Next 2 | from .llava_xtuner import LLaVA_XTuner 3 | 4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner'] 5 | -------------------------------------------------------------------------------- /vlmeval/vlm/minigpt4.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os.path as osp 4 | import warnings 5 | from transformers import StoppingCriteriaList 6 | from .base import BaseModel 7 | 8 | 9 | class 
MiniGPT4(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, 15 | mode='v2', 16 | root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/', 17 | temperature=1, 18 | max_out_len=512): 19 | 20 | if root is None: 21 | warnings.warn( 22 | 'Please set root to the directory of MiniGPT-4, which is cloned from here: ' 23 | 'https://github.com/Vision-CAIR/MiniGPT-4. ' 24 | ) 25 | 26 | if mode == 'v2': 27 | cfg = 'minigptv2_eval.yaml' 28 | elif mode == 'v1_7b': 29 | cfg = 'minigpt4_7b_eval.yaml' 30 | elif mode == 'v1_13b': 31 | cfg = 'minigpt4_13b_eval.yaml' 32 | else: 33 | raise NotImplementedError 34 | 35 | self.mode = mode 36 | self.temperature = temperature 37 | self.max_out_len = max_out_len 38 | self.root = root 39 | this_dir = osp.dirname(__file__) 40 | 41 | self.cfg = osp.join(this_dir, 'misc', cfg) 42 | sys.path.append(self.root) 43 | 44 | from omegaconf import OmegaConf 45 | from minigpt4.common.registry import registry 46 | from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2 47 | 48 | device = torch.cuda.current_device() 49 | self.device = device 50 | 51 | cfg_path = self.cfg 52 | cfg = OmegaConf.load(cfg_path) 53 | 54 | model_cfg = cfg.model 55 | model_cfg.device_8bit = device 56 | model_cls = registry.get_model_class(model_cfg.arch) 57 | model = model_cls.from_config(model_cfg) 58 | model = model.to(device) 59 | model.eval() 60 | vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train 61 | vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) 62 | self.model = model 63 | self.vis_processor = vis_processor 64 | 65 | self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0 66 | stop_words_ids = [[835], [2277, 29937]] 67 | stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids] 68 | self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) 69 | 70 | def generate_inner(self, message, dataset=None): 71 | from minigpt4.conversation.conversation import Chat 72 | prompt, image_path = self.message_to_promptimg(message) 73 | if self.mode == 'v2': 74 | chat = Chat(self.model, self.vis_processor, device=self.device) 75 | else: 76 | chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria) 77 | 78 | chat_state = self.CONV_VISION.copy() 79 | img_list = [] 80 | _ = chat.upload_img(image_path, chat_state, img_list) 81 | chat.encode_img(img_list) 82 | chat.ask(prompt, chat_state) 83 | with torch.inference_mode(): 84 | msg = chat.answer(conv=chat_state, img_list=img_list)[0] 85 | return msg 86 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigpt4_13b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-13b-v0" 25 | 26 | datasets: 27 | cc_sbu_align: 28 | vis_processor: 29 | train: 30 | name: "blip2_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | 36 | run: 37 | task: image_text_pretrain 38 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigpt4_7b_eval.yaml: 
-------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-7b-v0" 25 | 26 | 27 | datasets: 28 | cc_sbu_align: 29 | vis_processor: 30 | train: 31 | name: "blip2_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | 37 | run: 38 | task: image_text_pretrain 39 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigptv2_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt_v2 3 | model_type: pretrain 4 | max_txt_len: 160 5 | end_sym: "" 6 | low_resource: True 7 | prompt_template: '[INST] {} [/INST]' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | lora_r: 64 10 | lora_alpha: 16 11 | 12 | # vit encoder 13 | image_size: 448 14 | drop_path_rate: 0 15 | use_grad_checkpoint: False 16 | vit_precision: "fp16" 17 | freeze_vit: True 18 | 19 | # generation configs 20 | prompt: "" 21 | 22 | # LLM 23 | llama_model: "please set this value to the path of llama2-chat-7b" 24 | 25 | datasets: 26 | cc_sbu_align: 27 | vis_processor: 28 | train: 29 | name: "blip2_image_eval" 30 | image_size: 448 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | 35 | run: 36 | task: image_text_pretrain 37 | -------------------------------------------------------------------------------- /vlmeval/vlm/mmalaya.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | from PIL import Image 5 | from .base import BaseModel 6 | 7 | 8 | class MMAlaya(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='DataCanvas/MMAlaya', **kwargs): 14 | assert model_path is not None 15 | self.model_path = model_path 16 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 17 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True).eval() 18 | # need initialize tokenizer 19 | model.initialize_tokenizer(self.tokenizer) 20 | self.model = model.cuda() 21 | 22 | self.kwargs = kwargs 23 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 24 | torch.cuda.empty_cache() 25 | 26 | def generate_inner(self, message, dataset=None): 27 | # read image 28 | prompt, image_path = self.message_to_promptimg(message) 29 | image = Image.open(image_path).convert('RGB') 30 | # tokenize prompt, and proprecess image 31 | input_ids, image_tensor, stopping_criteria = self.model.prepare_for_inference( 32 | prompt, 33 | self.tokenizer, 34 | image, 35 | return_tensors='pt') 36 | with torch.inference_mode(): 37 | output_ids = self.model.generate( 38 | inputs=input_ids.cuda(), 39 | images=image_tensor.cuda(), 40 | do_sample=False, 41 | max_new_tokens=512, 42 | num_beams=1, 43 | use_cache=True, 44 | stopping_criteria=[stopping_criteria], 45 | ) 46 | # truncate input_ids in generate_ids and then decode to text 47 | input_token_len = input_ids.shape[1] 48 | response = self.tokenizer.batch_decode( 49 | output_ids[:, input_token_len:].cpu(), 50 | skip_special_tokens=True, 51 | clean_up_tokenization_spaces=False 52 | )[0].strip() 53 | return response 54 | 55 | 56 | if __name__ == '__main__': 57 | model = MMAlaya() 58 | response = model.generate(['./assets/apple.jpg', '请详细描述一下这张图片。']) 59 | print(response) 60 | 61 | """ 62 | export PYTHONPATH=$PYTHONPATH:/tmp/VLMEvalKit 63 | CUDA_VISIBLE_DEVICES=0 python vlmeval/vlm/mmalaya.py 64 | """ 65 | -------------------------------------------------------------------------------- /vlmeval/vlm/open_flamingo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from PIL import Image 4 | import os.path as osp 5 | import warnings 6 | from .base import BaseModel 7 | from ..smp import splitlen, get_cache_path 8 | from huggingface_hub import snapshot_download 9 | 10 | 11 | class OpenFlamingo(BaseModel): 12 | 13 | INSTALL_REQ = True 14 | INTERLEAVE = True 15 | 16 | def __init__(self, 17 | name, 18 | mpt_pth=None, 19 | ckpt_pth=None, 20 | **kwargs): 21 | 22 | if mpt_pth is None: 23 | warnings.warn( 24 | 'Please set `mpt_pth` to the directory of MPT-7B, which is cloned from here: ' 25 | 'https://huggingface.co/mosaicml/mpt-7b. ' 26 | ) 27 | sys.exit(-1) 28 | if ckpt_pth is None: 29 | warnings.warn( 30 | 'Please set `ckpt_pth` to the openflamingo ckpt, which is the `checkpoint.pt` file downloaded ' 31 | 'from: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b/tree/main. 
' 32 | ) 33 | sys.exit(-1) 34 | else: 35 | if osp.exists(ckpt_pth): 36 | if ckpt_pth.endswith('checkpoint.pt'): 37 | pass 38 | elif osp.isdir(ckpt_pth): 39 | ckpt_pth = osp.join(ckpt_pth, 'checkpoint.pt') 40 | if not osp.exists(ckpt_pth): 41 | sys.exit(-1) 42 | elif splitlen(ckpt_pth, '/') == 2: 43 | cache_path = get_cache_path(ckpt_pth) 44 | if cache_path is None: 45 | snapshot_download(ckpt_pth) 46 | cache_path = get_cache_path(ckpt_pth) 47 | if cache_path is None: 48 | sys.exit(-1) 49 | else: 50 | ckpt_pth = osp.join(cache_path, 'checkpoint.pt') 51 | 52 | self.name = name 53 | assert name in ['v2'] 54 | self.mpt_pth = mpt_pth 55 | try: 56 | from open_flamingo import create_model_and_transforms 57 | except: 58 | raise ImportError('Please first install open_flamingo to use OpenFlamingo') 59 | model, image_processor, tokenizer = create_model_and_transforms( 60 | clip_vision_encoder_path='ViT-L-14', 61 | clip_vision_encoder_pretrained='openai', 62 | lang_encoder_path=mpt_pth, 63 | tokenizer_path=mpt_pth, 64 | cross_attn_every_n_layers=4) 65 | ckpt = torch.load(ckpt_pth) 66 | model.load_state_dict(ckpt, strict=False) 67 | torch.cuda.empty_cache() 68 | self.model = model.eval().cuda() 69 | self.tokenizer = tokenizer 70 | self.tokenizer.padding_side = 'left' 71 | self.image_proc = image_processor 72 | 73 | kwargs_default = dict(max_new_tokens=512, num_beams=3) 74 | kwargs_default.update(kwargs) 75 | self.kwargs = kwargs_default 76 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 77 | 78 | def generate_inner(self, message, dataset=None): 79 | vision_x = [] 80 | prompt = '' 81 | for msg in message: 82 | if msg['type'] == 'image': 83 | img = Image.open(msg['value']) 84 | vision_x.append(self.image_proc(img).unsqueeze(0)) 85 | prompt += '' 86 | elif msg['type'] == 'text': 87 | prompt += msg['value'] 88 | prompt += 'Answer: ' 89 | vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0] 90 | vision_x = vision_x.unsqueeze(1).unsqueeze(0) 91 | lang_x = self.tokenizer([prompt], return_tensors='pt') 92 | generated_text = self.model.generate( 93 | vision_x=vision_x.cuda(), 94 | lang_x=lang_x['input_ids'].cuda(), 95 | attention_mask=lang_x['attention_mask'].cuda(), 96 | **self.kwargs) 97 | generated_text = self.tokenizer.decode(generated_text[0]) 98 | text = generated_text[len(prompt):].split('<|endofchunk|>')[0] 99 | return text 100 | -------------------------------------------------------------------------------- /vlmeval/vlm/paligemma.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class PaliGemma(BaseModel): 9 | INSTALL_REQ = False 10 | INTERLEAVE = False 11 | 12 | def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs): 13 | try: 14 | from transformers import AutoProcessor, PaliGemmaForConditionalGeneration 15 | except: 16 | warnings.warn('Please install the latest version transformers.') 17 | sys.exit(-1) 18 | model = PaliGemmaForConditionalGeneration.from_pretrained( 19 | model_path, 20 | torch_dtype=torch.bfloat16, 21 | device_map='cpu', 22 | revision='bfloat16', 23 | ).eval() 24 | self.model = model.cuda() 25 | self.processor = AutoProcessor.from_pretrained(model_path) 26 | self.kwargs = kwargs 27 | 28 | def generate_inner(self, message, dataset=None): 29 | prompt, image_path = self.message_to_promptimg(message) 30 | image = Image.open(image_path).convert('RGB') 31 | 
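# The processor call below tokenizes `prompt` and converts `image` into pixel values in a
# single batch on the GPU; `input_len` records the prompt length so that only the newly
# generated tokens are decoded into the final response.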
32 | model_inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda') 33 | input_len = model_inputs['input_ids'].shape[-1] 34 | 35 | with torch.inference_mode(): 36 | generation = self.model.generate(**model_inputs, max_new_tokens=512, do_sample=False) 37 | generation = generation[0][input_len:] 38 | res = self.processor.decode(generation, skip_special_tokens=True) 39 | return res 40 | -------------------------------------------------------------------------------- /vlmeval/vlm/pandagpt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import os.path as osp 4 | import warnings 5 | from .base import BaseModel 6 | 7 | 8 | class PandaGPT(BaseModel): 9 | 10 | INSTALL_REQ = True 11 | INTERLEAVE = False 12 | 13 | def __init__(self, name, root=None, **kwargs): 14 | if root is None: 15 | warnings.warn('Please set `root` to PandaGPT code directory, which is cloned from here: ') 16 | sys.exit(-1) 17 | 18 | assert name == 'PandaGPT_13B' 19 | self.name = name 20 | sys.path.append(osp.join(root, 'code')) 21 | try: 22 | from model.openllama import OpenLLAMAPEFTModel 23 | except: 24 | raise ImportError( 25 | 'Please first install PandaGPT and set the root path to use PandaGPT, ' 26 | 'which is cloned from here: https://github.com/yxuansu/PandaGPT. ' 27 | ) 28 | self.args = { 29 | 'model': 'openllama_peft', 30 | 'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'), 31 | 'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'), 32 | 'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'), 33 | 'stage': 2, 34 | 'max_tgt_len': 512, 35 | 'lora_r': 32, 36 | 'lora_alpha': 32, 37 | 'lora_dropout': 0.1, 38 | } 39 | model = OpenLLAMAPEFTModel(**self.args) 40 | delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) 41 | model.load_state_dict(delta_ckpt, strict=False) 42 | torch.cuda.empty_cache() 43 | self.model = model.eval().half().cuda() 44 | kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} 45 | kwargs_default.update(kwargs) 46 | self.kwargs = kwargs_default 47 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 48 | 49 | def generate_inner(self, message, dataset=None): 50 | prompt, image_path = self.message_to_promptimg(message) 51 | struct = { 52 | 'prompt': prompt, 53 | 'image_paths': [image_path], 54 | 'audio_paths': [], 55 | 'video_paths': [], 56 | 'thermal_paths': [], 57 | 'modality_embeds': [] 58 | } 59 | struct.update(self.kwargs) 60 | resp = self.model.generate(struct) 61 | return resp 62 | -------------------------------------------------------------------------------- /vlmeval/vlm/phi3_vision.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class Phi3Vision(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs): 14 | try: 15 | from transformers import AutoProcessor, AutoModelForCausalLM 16 | except: 17 | warnings.warn('Please install the latest version transformers.') 18 | sys.exit(-1) 19 | model = AutoModelForCausalLM.from_pretrained( 20 | model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto').eval() 21 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 22 | self.model = model 23 | self.processor = processor 24 | self.kwargs = kwargs 25 | 26 | def generate_inner(self, message, dataset=None): 27 | prompt, image_path = self.message_to_promptimg(message) 28 | image = Image.open(image_path).convert('RGB') 29 | messages = [ 30 | {'role': 'user', 'content': f'<|image_1|>\n{prompt}'} 31 | ] 32 | prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 33 | inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda') 34 | 35 | generation_args = { 36 | 'max_new_tokens': 500, 37 | 'temperature': 0.0, 38 | 'do_sample': False, 39 | } 40 | generation_args.update(self.kwargs) 41 | 42 | generate_ids = self.model.generate( 43 | **inputs, 44 | eos_token_id=self.processor.tokenizer.eos_token_id, 45 | **generation_args 46 | ) 47 | generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] 48 | response = self.processor.batch_decode( 49 | generate_ids, 50 | skip_special_tokens=True, 51 | clean_up_tokenization_spaces=False 52 | )[0] 53 | return response 54 | -------------------------------------------------------------------------------- /vlmeval/vlm/qh_360vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import os.path as osp 5 | from PIL import Image 6 | from .base import BaseModel 7 | from ..smp import * 8 | from ..utils import DATASET_TYPE 9 | 10 | 11 | class QH_360VL(BaseModel): 12 | 13 | INSTALL_REQ = False 14 | INTERLEAVE = False 15 | 16 | def __init__(self, model_path='qihoo360/360VL-70B', **kwargs): 17 | assert model_path is not None 18 | self.model_path = model_path 19 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 20 | self.model = AutoModelForCausalLM.from_pretrained(model_path, 21 | torch_dtype=torch.float16, 22 | low_cpu_mem_usage=True, 23 | device_map='auto', 24 | trust_remote_code=True).eval() 25 | vision_tower = self.model.get_vision_tower() 26 | vision_tower.load_model() 27 | vision_tower.to(device='cuda', dtype=torch.float16) 28 | self.image_processor = vision_tower.image_processor 29 | self.tokenizer.pad_token = self.tokenizer.eos_token 30 | 
self.kwargs = kwargs 31 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 32 | torch.cuda.empty_cache() 33 | 34 | def generate(self, message, dataset=None): 35 | 36 | prompt, image_path = self.message_to_promptimg(message) 37 | print(prompt) 38 | image = Image.open(image_path).convert('RGB') 39 | terminators = [ 40 | self.tokenizer.convert_tokens_to_ids('<|eot_id|>',) 41 | ] 42 | inputs = self.model.build_conversation_input_ids(self.tokenizer, 43 | query=prompt, 44 | image=image, 45 | image_processor=self.image_processor) 46 | input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True) 47 | images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True) 48 | 49 | output_ids = self.model.generate(input_ids=input_ids, 50 | images=images, 51 | do_sample=False, 52 | num_beams=1, 53 | max_new_tokens=512, 54 | eos_token_id=terminators, 55 | use_cache=True) 56 | 57 | input_token_len = input_ids.shape[1] 58 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 59 | response = outputs.strip() 60 | 61 | return response 62 | -------------------------------------------------------------------------------- /vlmeval/vlm/qwen_vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import copy as cp 5 | from .base import BaseModel 6 | from ..smp import isimg, listinstr 7 | from ..utils import DATASET_TYPE 8 | 9 | 10 | class QwenVL(BaseModel): 11 | 12 | INSTALL_REQ = False 13 | INTERLEAVE = True 14 | 15 | def __init__(self, model_path='Qwen/Qwen-VL', **kwargs): 16 | assert model_path is not None 17 | self.model_path = model_path 18 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 19 | tokenizer.padding_side = 'left' 20 | tokenizer.pad_token_id = tokenizer.eod_id 21 | self.tokenizer = tokenizer 22 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval() 23 | default_kwargs = dict( 24 | do_sample=False, 25 | num_beams=1, 26 | max_new_tokens=512, 27 | min_new_tokens=1, 28 | num_return_sequences=1, 29 | use_cache=True, 30 | output_hidden_states=True, 31 | pad_token_id=tokenizer.eod_id, 32 | eos_token_id=tokenizer.eod_id) 33 | default_kwargs.update(kwargs) 34 | self.kwargs = default_kwargs 35 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 36 | torch.cuda.empty_cache() 37 | 38 | def adjust_kwargs(self, dataset): 39 | kwargs = cp.deepcopy(self.kwargs) 40 | if DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: 41 | kwargs['max_new_tokens'] = 32 42 | elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset: 43 | kwargs['max_new_tokens'] = 32 44 | elif DATASET_TYPE(dataset) == 'VQA': 45 | if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset): 46 | kwargs['max_new_tokens'] = 100 47 | elif listinstr(['TextVQA'], dataset): 48 | kwargs['max_new_tokens'] = 10 49 | return kwargs 50 | 51 | def generate_inner(self, message, dataset=None): 52 | if dataset is not None: 53 | kwargs = self.adjust_kwargs(dataset) 54 | else: 55 | kwargs = self.kwargs 56 | prompt = '' 57 | for s in message: 58 | if s['type'] == 'image': 59 | prompt += f'{s["value"]}' 60 | elif s['type'] == 'text': 61 | prompt += s['value'] 62 | if dataset is not None and DATASET_TYPE(dataset) == 'VQA': 63 | prompt += ' Answer:' 64 | encoded = self.tokenizer([prompt], return_tensors='pt', padding='longest') 65 | input_ids = encoded.input_ids.to('cuda') 66 | attention_mask = encoded.attention_mask.to('cuda') 67 | 68 | pred = self.model.generate( 69 | input_ids=input_ids, 70 | attention_mask=attention_mask, 71 | **kwargs) 72 | answer = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip() 73 | return answer 74 | 75 | 76 | class QwenVLChat(BaseModel): 77 | 78 | INSTALL_REQ = False 79 | INTERLEAVE = True 80 | 81 | def __init__(self, model_path='Qwen/Qwen-VL-Chat', **kwargs): 82 | assert model_path is not None 83 | self.model_path = model_path 84 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 85 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval() 86 | torch.cuda.empty_cache() 87 | self.kwargs = kwargs 88 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 89 | 90 | def generate_inner(self, message, dataset=None): 91 | vl_list = [{'image': s['value']} if s['type'] == 'image' else {'text': s['value']} for s in message] 92 | query = self.tokenizer.from_list_format(vl_list) 93 | response, _ = self.model.chat(self.tokenizer, query=query, history=None, **self.kwargs) 94 | return response 95 | -------------------------------------------------------------------------------- /vlmeval/vlm/visualglm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .base import BaseModel 3 | from ..smp import * 4 | 5 | 6 | class VisualGLM(BaseModel): 7 | 8 | INSTALL_REQ = False 9 | INTERLEAVE = False 10 | 11 | def __init__(self, model_path='THUDM/visualglm-6b', **kwargs): 12 | try: 13 | import sat 14 | except: 15 | warnings.warn('Please install SwissArmyTransformer to use VisualGLM') 16 | assert model_path is not None 17 | self.model_path = model_path 18 | 19 | from transformers import AutoModel 20 | from transformers import AutoTokenizer 21 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 22 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda() 23 | self.model = model 24 | self.kwargs = kwargs 25 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 26 | 27 | def generate_inner(self, message, dataset=None): 28 | prompt, image_path = self.message_to_promptimg(message) 29 | output, _ = self.model.chat( 30 | image_path=image_path, 31 | tokenizer=self.tokenizer, 32 | query=prompt, 33 | history=[], 34 | **self.kwargs 35 | ) 36 | return output 37 | -------------------------------------------------------------------------------- /vlmeval/vlm/wemm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import sys 4 | from ..smp import * 5 | from .base import BaseModel 6 | from ..utils import DATASET_TYPE 7 | from transformers import AutoModel, GenerationConfig 8 | 9 | 10 | class WeMM(BaseModel): 11 | def __init__(self, model_path='feipengma/WeMM', **kwargs): 12 | self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True) 13 | self.wemm.cuda() 14 | self.wemm.eval() 15 | torch.cuda.empty_cache() 16 | 17 | def use_custom_prompt(self, dataset): 18 | assert dataset is not None 19 | if DATASET_TYPE(dataset) == 'multi-choice': 20 | return True 21 | return False 22 | 23 | def build_prompt(self, line, dataset=None): 24 | assert self.use_custom_prompt(dataset) 25 | assert dataset is None or isinstance(dataset, str) 26 | tgt_path = self.dump_image(line, dataset) 27 | question = line['question'] 28 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 29 | if hint is not None: 30 | question = hint + '\n' + question 31 | options = { 32 | cand: line[cand] 33 | for cand in string.ascii_uppercase 34 | if cand in line and not pd.isna(line[cand]) 35 | } 36 | for key, item in options.items(): 37 | question += f'\n{key}. {item}' 38 | prompt = question 39 | 40 | if len(options): 41 | prompt += ( 42 | '\n请直接回答选项字母。' if cn_string(prompt) else 43 | "\nAnswer with the option's letter from the given choices directly." 44 | ) 45 | else: 46 | prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' 47 | 48 | message = [dict(type='text', value=prompt)] 49 | message.extend([dict(type='image', value=p) for p in tgt_path]) 50 | return message 51 | 52 | def generate_inner(self, message, dataset=None): 53 | prompt, image_path = self.message_to_promptimg(message) 54 | 55 | if dataset == 'HallusionBench': 56 | prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.' 
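# Dataset-specific handling: HallusionBench prompts are constrained to yes/no answers above,
# while MMVet (below) receives an explicit sampling GenerationConfig; all other datasets fall
# back to the model's default generation settings (gen_config=None).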
57 | 58 | gen_config = None 59 | if dataset == 'MMVet': 60 | gen_config = GenerationConfig( 61 | max_new_tokens=512, 62 | do_sample=True, 63 | temperatures=0.7, 64 | num_beams=3, 65 | eos_token_id=self.wemm.tokenizer.eos_token_id, 66 | pad_token_id=self.wemm.tokenizer.pad_token_id 67 | if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id, 68 | ) 69 | pred = self.wemm.mm_generate(image_path, prompt, gen_config) 70 | 71 | return pred 72 | -------------------------------------------------------------------------------- /vlmeval/vlm/xcomposer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sharecaptioner import ShareCaptioner 2 | from .xcomposer import XComposer 3 | from .xcomposer2 import XComposer2 4 | from .xcomposer2_4KHD import XComposer2_4KHD 5 | 6 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD'] 7 | -------------------------------------------------------------------------------- /vlmeval/vlm/xcomposer/sharecaptioner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from ..base import BaseModel 4 | from ...smp import * 5 | from ...utils import DATASET_TYPE 6 | 7 | 8 | class ShareCaptioner(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='Lin-Chen/ShareCaptioner', **kwargs): 14 | assert model_path is not None 15 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 16 | self.model = AutoModelForCausalLM.from_pretrained( 17 | model_path, device_map='cuda', trust_remote_code=True).eval() 18 | self.model.tokenizer = tokenizer 19 | self.model.cuda() 20 | self.model.half() 21 | 22 | def use_custom_prompt(self, dataset): 23 | assert dataset is not None 24 | if DATASET_TYPE(dataset) == 'multi-choice': 25 | return True 26 | return False 27 | 28 | def build_prompt(self, line, dataset=None): 29 | assert dataset is None or isinstance(dataset, str) 30 | assert self.use_custom_prompt(dataset) 31 | tgt_path = self.dump_image(line, dataset) 32 | 33 | if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': 34 | question = line['question'] 35 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 36 | if hint is not None: 37 | question = hint + '\n' + question 38 | 39 | option_candidate = string.ascii_uppercase 40 | options = { 41 | cand: line[cand] 42 | for cand in option_candidate 43 | if cand in line and not pd.isna(line[cand]) 44 | } 45 | for key, item in options.items(): 46 | question += f'\n{key}. {item}' 47 | prompt = question 48 | 49 | if not cn_string(prompt): 50 | prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly." 
51 | else: 52 | prompt = prompt + '\n' + '请直接回答选项字母。' 53 | else: 54 | prompt = line['question'] 55 | message = [dict(type='text', value=prompt)] 56 | message.extend([dict(type='image', value=s) for s in tgt_path]) 57 | return message 58 | 59 | def generate_inner(self, message, dataset=None): 60 | prompt, image_path = self.message_to_promptimg(message) 61 | seg1 = '<|User|>:' 62 | seg2 = f'{prompt}{self.model.eoh}\n<|Bot|>:' 63 | self.seg_emb1 = self.model.encode_text(seg1, add_special_tokens=True) 64 | self.seg_emb2 = self.model.encode_text(seg2, add_special_tokens=False) 65 | 66 | image = Image.open(image_path).convert('RGB') 67 | image = self.model.vis_processor(image).unsqueeze(0) 68 | image = image.to(self.model.device) 69 | tmp_bs = image.shape[0] 70 | tmp_seg_emb1 = self.seg_emb1.repeat(tmp_bs, 1, 1) 71 | tmp_seg_emb2 = self.seg_emb2.repeat(tmp_bs, 1, 1) 72 | with torch.cuda.amp.autocast(): 73 | with torch.no_grad(): 74 | image = self.model.encode_img(image) 75 | input_emb = torch.cat( 76 | [tmp_seg_emb1, image, tmp_seg_emb2], dim=1) 77 | out_embeds = self.model.internlm_model.generate( 78 | inputs_embeds=input_emb, 79 | max_length=500, 80 | num_beams=3, 81 | min_length=1, 82 | do_sample=True, 83 | repetition_penalty=1.5, 84 | length_penalty=1.0, 85 | temperature=1., 86 | eos_token_id=self.model.tokenizer.eos_token_id, 87 | num_return_sequences=1) 88 | 89 | for j, out in enumerate(out_embeds): 90 | out[out == -1] = 2 91 | response = self.model.decode_text([out]) 92 | return response 93 | --------------------------------------------------------------------------------
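The wrapper classes collected above share one calling convention: each implements generate_inner(message, dataset=None), where message is a list of dicts whose 'type' is 'image' or 'text' and whose 'value' is an image path or a prompt string. The sketch below is illustrative only and is not part of the repository dump; the import location of QwenVLChat (assumed to be re-exported from vlmeval.vlm) is an assumption, while the class itself, the message schema, and assets/apple.jpg come from the files shown above.

# Minimal driver sketch (illustrative; the vlmeval.vlm export is an assumption).
from vlmeval.vlm import QwenVLChat  # assumed re-export of the class defined in vlmeval/vlm/qwen_vl.py

# QwenVLChat loads the Hugging Face checkpoint onto the GPU with trust_remote_code=True.
model = QwenVLChat(model_path='Qwen/Qwen-VL-Chat')

# Every wrapper consumes a list of typed segments. INTERLEAVE=True models accept arbitrary
# image/text interleaving; single-image models reduce the list to one prompt and one image
# via message_to_promptimg before generation.
message = [
    dict(type='image', value='assets/apple.jpg'),
    dict(type='text', value='Describe this image in one sentence.'),
]
print(model.generate_inner(message))

Because the segments are typed rather than positional, the same message structure can drive both single-image and interleaved models without per-model prompt templates.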