├── .github └── workflows │ └── lint.yml ├── .gitignore ├── .vscode └── launch.json ├── Quickstart.md ├── README.md ├── VLMEvalKit-main ├── .github │ └── workflows │ │ └── lint.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets │ ├── LOGO.svg │ └── apple.jpg ├── docs │ ├── en │ │ ├── Development.md │ │ └── Quickstart.md │ ├── ja │ │ └── README_ja.md │ └── zh-CN │ │ ├── Development_zh-CN.md │ │ ├── Quickstart_zh-CN.md │ │ └── README_zh-CN.md ├── requirements.txt ├── run.py ├── scripts │ ├── AI2D_preproc.ipynb │ ├── apires_scan.py │ ├── auto_run.py │ ├── cover.sh │ ├── mmb_eval_gradio.py │ ├── run.sh │ ├── srun.sh │ ├── summarize.py │ └── visualize.ipynb ├── setup.py └── vlmeval │ ├── __init__.py │ ├── api │ ├── __init__.py │ ├── base.py │ ├── claude.py │ ├── cloudwalk.py │ ├── gemini.py │ ├── glm_vision.py │ ├── gpt.py │ ├── gpt_int.py │ ├── hf_chat_model.py │ ├── qwen_api.py │ ├── qwen_vl_api.py │ ├── reka.py │ └── stepai.py │ ├── config.py │ ├── evaluate │ ├── OCRBench.py │ ├── __init__.py │ ├── coco_eval.py │ ├── llavabench.py │ ├── mathvista_eval.py │ ├── misc.py │ ├── mmvet_eval.py │ ├── multiple_choice.py │ ├── vqa_eval.py │ └── yes_or_no.py │ ├── inference.py │ ├── smp │ ├── __init__.py │ ├── file.py │ ├── log.py │ ├── misc.py │ └── vlm.py │ ├── tools.py │ ├── utils │ ├── __init__.py │ ├── custom_prompt.py │ ├── dataset.py │ ├── dataset_config.py │ ├── matching_util.py │ ├── mp_util.py │ └── result_transfer.py │ └── vlm │ ├── __init__.py │ ├── base.py │ ├── bunnyllama3.py │ ├── cogvlm.py │ ├── deepseek_vl.py │ ├── emu.py │ ├── idefics.py │ ├── instructblip.py │ ├── internvl_chat.py │ ├── llava │ ├── __init__.py │ ├── llava.py │ └── llava_xtuner.py │ ├── mgm.py │ ├── minicpm_v.py │ ├── minigpt4.py │ ├── misc │ ├── blip2_instruct_vicuna13b.yaml │ ├── blip2_instruct_vicuna7b.yaml │ ├── minigpt4_13b_eval.yaml │ ├── minigpt4_7b_eval.yaml │ └── minigptv2_eval.yaml │ ├── mmalaya.py │ ├── monkey.py │ ├── mplug_owl2.py │ ├── omnilmm.py │ ├── open_flamingo.py │ ├── paligemma.py │ ├── pandagpt.py │ ├── phi3_vision.py │ ├── qh_360vl.py │ ├── qwen_vl.py │ ├── transcore_m.py │ ├── visualglm.py │ ├── vxverse.py │ ├── wemm.py │ ├── xcomposer │ ├── __init__.py │ ├── sharecaptioner.py │ ├── xcomposer.py │ ├── xcomposer2.py │ └── xcomposer2_4KHD.py │ └── yi_vl.py ├── assets ├── LOGO.svg ├── apple.jpg ├── metatask_eval.png ├── overall_progress.png └── overview.jpg ├── requirements.txt ├── run.py ├── setup.py └── vlmeval ├── __init__.py ├── api ├── __init__.py ├── base.py ├── claude.py ├── cloudwalk.py ├── gemini.py ├── glm_vision.py ├── gpt.py ├── gpt_int.py ├── hf_chat_model.py ├── qwen_api.py ├── qwen_vl_api.py ├── reka.py └── stepai.py ├── config.py ├── evaluate ├── OCRBench.py ├── __init__.py ├── coco_eval.py ├── llavabench.py ├── mathvista_eval.py ├── misc.py ├── mmvet_eval.py ├── multiple_choice.py ├── vqa_eval.py └── yes_or_no.py ├── inference.py ├── smp ├── __init__.py ├── file.py ├── log.py ├── misc.py └── vlm.py ├── tools.py ├── utils ├── __init__.py ├── custom_prompt.py ├── dataset.py ├── dataset_config.py ├── matching_util.py ├── mp_util.py └── result_transfer.py └── vlm ├── __init__.py ├── base.py ├── bunnyllama3.py ├── cogvlm.py ├── deepseek_vl.py ├── emu.py ├── idefics.py ├── instructblip.py ├── internvl_chat.py ├── llava ├── __init__.py ├── llava.py └── llava_xtuner.py ├── mgm.py ├── minicpm_v.py ├── minigpt4.py ├── misc ├── blip2_instruct_vicuna13b.yaml ├── blip2_instruct_vicuna7b.yaml ├── minigpt4_13b_eval.yaml ├── minigpt4_7b_eval.yaml └── 
minigptv2_eval.yaml ├── mmalaya.py ├── monkey.py ├── mplug_owl2.py ├── omnilmm.py ├── open_flamingo.py ├── paligemma.py ├── pandagpt.py ├── phi3_vision.py ├── qh_360vl.py ├── qwen_vl.py ├── transcore_m.py ├── visualglm.py ├── vxverse.py ├── wemm.py ├── xcomposer ├── __init__.py ├── sharecaptioner.py ├── xcomposer.py ├── xcomposer2.py └── xcomposer2_4KHD.py └── yi_vl.py /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.7 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.7 18 | - name: Install pre-commit hook 19 | run: | 20 | pip install pre-commit 21 | pre-commit install 22 | - name: Linting 23 | run: pre-commit run --all-files 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # Images 156 | images/ 157 | 158 | scripts/*ttf 159 | 160 | lvlm_zoo/ 161 | LMUData/ 162 | work_dirs 163 | 164 | batchscript-* 165 | phoenix-slurm-* 166 | LMUData 167 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python Debugger: Current File", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal" 13 | }, 14 | { 15 | "name": "main", 16 | "type": "python", 17 | "request": "launch", 18 | "program": "run.py", 19 | "console": "integratedTerminal", 20 | "justMyCode": true, 21 | "args": [ 22 | "--data", "MMT-Bench_ALL", 23 | "--model", "llava_v1.5_7b", 24 | "--work-dir", "work_dirs/mmtbench" 25 | ] 26 | }, 27 | ] 28 | } -------------------------------------------------------------------------------- /VLMEvalKit-main/.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 3.7 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.7 18 | - name: Install pre-commit hook 19 | run: | 20 | pip install pre-commit 21 | pre-commit install 22 | - name: Linting 23 | run: pre-commit run --all-files 24 | -------------------------------------------------------------------------------- /VLMEvalKit-main/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 
| share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # Images 156 | images/ 157 | 158 | scripts/*ttf 159 | -------------------------------------------------------------------------------- /VLMEvalKit-main/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: | 2 | (?x)^( 3 | scripts/| 4 | assets/| 5 | vlmeval/config.py 6 | ) 7 | repos: 8 | - repo: https://github.com/PyCQA/flake8 9 | rev: 5.0.4 10 | hooks: 11 | - id: flake8 12 | args: ["--max-line-length=120", "--ignore=F401,F403,F405,E402,E722,E741,W503"] 13 | exclude: ^configs/ 14 | - repo: https://github.com/pre-commit/mirrors-yapf 15 | rev: v0.30.0 16 | hooks: 17 | - id: yapf 18 | args: ["--style={column_limit=120}"] 19 | - repo: https://github.com/pre-commit/pre-commit-hooks 20 | rev: v3.1.0 21 | hooks: 22 | - id: trailing-whitespace 23 | - id: check-yaml 24 | - id: end-of-file-fixer 25 | - id: requirements-txt-fixer 26 | - id: double-quote-string-fixer 27 | - id: check-merge-conflict 28 | - id: fix-encoding-pragma 29 | args: ["--remove"] 30 | - id: mixed-line-ending 31 | args: ["--fix=lf"] 32 | -------------------------------------------------------------------------------- /VLMEvalKit-main/assets/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMT-Bench/84012c95e31c2986521ea5b7c16a88e36e9958c2/VLMEvalKit-main/assets/apple.jpg -------------------------------------------------------------------------------- /VLMEvalKit-main/requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | gradio==4.15.0 3 | huggingface_hub 4 | matplotlib 5 | numpy>=1.23.4 6 | omegaconf 7 | openai==1.3.5 8 | opencv-python>=4.4.0.46 9 | openpyxl 10 | pandas>=1.5.3 11 | pillow 12 | portalocker 13 | protobuf 14 | pycocoevalcap 15 | python-dotenv 16 | requests 17 | rich 18 | seaborn 19 | sentencepiece 20 | sty 21 | tabulate 22 | tiktoken 23 | timeout-decorator 24 | torch>=2.0.1 25 | tqdm 26 | transformers 27 | typing_extensions==4.7.1 28 | validators 29 | visual_genome 30 | xlsxwriter 31 | xtuner 32 | -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/apires_scan.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from vlmeval import * 3 | FAIL_MSG = 'Failed to obtain answer via API.' 
4 | 5 | root = sys.argv[1] 6 | if root[-1] in '/\\': 7 | root = root[:-1] 8 | 9 | model_name = root.split('/')[-1] 10 | datasets = list(dataset_URLs) 11 | 12 | for d in datasets: 13 | fname = f'{model_name}_{d}.xlsx' 14 | pth = osp.join(root, fname) 15 | if osp.exists(pth): 16 | data = load(pth) 17 | # Detect Failure 18 | assert 'prediction' in data 19 | data['prediction'] = [str(x) for x in data['prediction']] 20 | fail = [FAIL_MSG in x for x in data['prediction']] 21 | if sum(fail): 22 | nfail = sum(fail) 23 | ntot = len(fail) 24 | print(f'Model {model_name} x Dataset {d}: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ') 25 | 26 | eval_files = ls(root, match=f'{model_name}_{d}_') 27 | eval_files = [x for x in eval_files if listinstr([f'{d}_openai', f'{d}_gpt'], x) and x.endswith('.xlsx')] 28 | 29 | if len(eval_files) == 0: 30 | print(f'Model {model_name} x Dataset {d} openai missing') 31 | continue 32 | 33 | assert len(eval_files) == 1 34 | eval_file = eval_files[0] 35 | data = load(eval_file) 36 | 37 | if 'MMVet' in d: 38 | bad = [x for x in data['log'] if 'All 5 retries failed.' in str(x)] 39 | if len(bad): 40 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 41 | elif 'MathVista' in d: 42 | bad = [x for x in data['res'] if FAIL_MSG in str(x)] 43 | if len(bad): 44 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 45 | 46 | elif d == 'LLaVABench': 47 | sub = data[data['gpt4_score'] == -1] 48 | sub = sub[sub['gpt4_score'] == -1] 49 | if len(sub): 50 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(sub)} out of {len(data)} failed.') 51 | else: 52 | bad = [x for x in data['log'] if FAIL_MSG in str(x)] 53 | if len(bad): 54 | print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') 55 | -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/auto_run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from vlmeval.smp import * 3 | from vlmeval.config import supported_VLM 4 | 5 | def is_api(x): 6 | return getattr(supported_VLM[x].func, 'is_api', False) 7 | 8 | models = list(supported_VLM) 9 | models = [x for x in models if 'fs' not in x] 10 | models = [x for x in models if not is_api(x)] 11 | exclude_list = ['cogvlm-grounding-generalist', 'emu2'] 12 | models = [x for x in models if x not in exclude_list] 13 | 14 | def is_large(x): 15 | return '80b' in x or 'emu2' in x or '34B' in x 16 | 17 | small_models = [x for x in models if not is_large(x)] 18 | large_models = [x for x in models if is_large(x)] 19 | models = small_models + large_models 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--data', type=str, nargs='+', required=True) 23 | args = parser.parse_args() 24 | 25 | # Skip some models 26 | models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)] 27 | 28 | for m in models: 29 | unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')] 30 | if len(unknown_datasets) == 0: 31 | continue 32 | dataset_str = ' '.join(unknown_datasets) 33 | if '80b' in m: 34 | cmd = f'python run.py --data {dataset_str} --model {m}' 35 | else: 36 | cmd = f'bash run.sh --data {dataset_str} --model {m}' 37 | print(cmd) 38 | os.system(cmd) -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/cover.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 3 | cp $DIR/../config.py $DIR/../vlmeval/ 4 | cp $DIR/../misc/* $DIR/../vlmeval/vlm/misc/ -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | export GPU=$(nvidia-smi --list-gpus | wc -l) 4 | torchrun --nproc-per-node=$GPU run.py ${@:1} -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/srun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2} -------------------------------------------------------------------------------- /VLMEvalKit-main/scripts/summarize.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.utils.dataset_config import dataset_URLs 3 | 4 | def get_score(model, dataset): 5 | 6 | file_name = f'{model}/{model}_{dataset}' 7 | if listinstr([ 8 | 'CCBench', 'MMBench', 'SEEDBench_IMG', 'MMMU', 'ScienceQA', 'AI2D_TEST', 'MMStar', 'RealWorldQA'], dataset): 9 | file_name += '_acc.csv' 10 | elif listinstr(['MME', 'Hallusion', 'LLaVABench'], dataset): 11 | file_name += '_score.csv' 12 | elif listinstr(['MMVet', 'MathVista'], dataset): 13 | file_name += '_gpt-4-turbo_score.csv' 14 | elif listinstr(['COCO', 'OCRBench'], dataset): 15 | file_name += '_score.json' 16 | else: 17 | raise NotImplementedError 18 | 19 | if not osp.exists(file_name): 20 | return {} 21 | 22 | data = load(file_name) 23 | ret = {} 24 | if dataset == 'CCBench': 25 | ret[dataset] = data['Overall'][0] * 100 26 | elif dataset == 'MMBench': 27 | for n, a in zip(data['split'], data['Overall']): 28 | if n == 'dev': 29 | ret['MMBench_DEV_EN'] = a * 100 30 | elif n == 'test': 31 | ret['MMBench_TEST_EN'] = a * 100 32 | elif dataset == 'MMBench_CN': 33 | for n, a in zip(data['split'], data['Overall']): 34 | if n == 'dev': 35 | ret['MMBench_DEV_CN'] = a * 100 36 | elif n == 'test': 37 | ret['MMBench_TEST_CN'] = a * 100 38 | elif listinstr(['SEEDBench', 'ScienceQA', 'MMBench', 'AI2D_TEST', 'MMStar', 'RealWorldQA'], dataset): 39 | ret[dataset] = data['Overall'][0] * 100 40 | elif 'MME' == dataset: 41 | ret[dataset] = data['perception'][0] + data['reasoning'][0] 42 | elif 'MMVet' == dataset: 43 | data = data[data['Category'] == 'Overall'] 44 | ret[dataset] = float(data.iloc[0]['acc']) 45 | elif 'HallusionBench' == dataset: 46 | data = data[data['split'] == 'Overall'] 47 | for met in ['aAcc', 'qAcc', 'fAcc']: 48 | ret[dataset + f' ({met})'] = float(data.iloc[0][met]) 49 | elif 'MMMU' in dataset: 50 | data = data[data['split'] == 'validation'] 51 | ret['MMMU (val)'] = float(data.iloc[0]['Overall']) * 100 52 | elif 'MathVista' in dataset: 53 | data = data[data['Task&Skill'] == 'Overall'] 54 | ret[dataset] = float(data.iloc[0]['acc']) 55 | elif 'LLaVABench' in dataset: 56 | data = data[data['split'] == 'overall'].iloc[0] 57 | ret[dataset] = float(data['Relative Score (main)']) 58 | elif 'OCRBench' in dataset: 59 | ret[dataset] = data['Final Score'] 60 | 61 | return ret 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser() 65 | 
parser.add_argument('--data', type=str, nargs='+', default=[]) 66 | parser.add_argument("--model", type=str, nargs='+', required=True) 67 | args = parser.parse_args() 68 | return args 69 | 70 | def gen_table(models, datasets): 71 | res = defaultdict(dict) 72 | for m in models: 73 | for d in datasets: 74 | try: 75 | res[m].update(get_score(m, d)) 76 | except: 77 | pass 78 | keys = [] 79 | for m in models: 80 | for d in res[m]: 81 | keys.append(d) 82 | keys = list(set(keys)) 83 | keys.sort() 84 | final = defaultdict(list) 85 | for m in models: 86 | final['Model'].append(m) 87 | for k in keys: 88 | if k in res[m]: 89 | final[k].append(res[m][k]) 90 | else: 91 | final[k].append(None) 92 | final = pd.DataFrame(final) 93 | dump(final, 'summ.csv') 94 | if len(final) >= len(final.iloc[0].keys()): 95 | print(tabulate(final)) 96 | else: 97 | print(tabulate(final.T)) 98 | 99 | if __name__ == '__main__': 100 | args = parse_args() 101 | if args.data == []: 102 | args.data = list(dataset_URLs) 103 | gen_table(args.model, args.data) -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | except ImportError: 4 | pass 5 | 6 | from .smp import * 7 | from .api import * 8 | from .evaluate import * 9 | from .utils import * 10 | from .vlm import * 11 | from .config import * 12 | from .tools import cli 13 | 14 | load_env() 15 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import OpenAIWrapper, GPT4V 2 | from .gpt_int import OpenAIWrapperInternal, GPT4V_Internal 3 | from .hf_chat_model import HFChatModel 4 | from .gemini import GeminiWrapper, GeminiProVision 5 | from .qwen_vl_api import QwenVLWrapper, QwenVLAPI 6 | from .qwen_api import QwenAPI 7 | from .stepai import Step1V_INT 8 | from .claude import Claude_Wrapper, Claude3V 9 | from .reka import Reka 10 | from .glm_vision import GLMVisionAPI 11 | from .cloudwalk import CWWrapper 12 | 13 | __all__ = [ 14 | 'OpenAIWrapper', 'HFChatModel', 'OpenAIWrapperInternal', 'GeminiWrapper', 15 | 'GPT4V', 'GPT4V_Internal', 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI', 16 | 'QwenAPI', 'Claude3V', 'Claude_Wrapper', 'Reka', 'Step1V_INT', 'GLMVisionAPI', 17 | 'CWWrapper' 18 | ] 19 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/claude.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import base64 5 | import mimetypes 6 | 7 | url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat' 8 | headers = { 9 | 'alles-apin-token': '', 10 | 'Content-Type': 'application/json' 11 | } 12 | 13 | 14 | class Claude_Wrapper(BaseAPI): 15 | 16 | is_api: bool = True 17 | 18 | def __init__(self, 19 | model: str = 'claude-3-opus-20240229', 20 | key: str = None, 21 | retry: int = 10, 22 | wait: int = 3, 23 | system_prompt: str = None, 24 | verbose: bool = True, 25 | temperature: float = 0, 26 | max_tokens: int = 1024, 27 | **kwargs): 28 | 29 | self.model = model 30 | self.headers = headers 31 | self.temperature = temperature 32 | self.max_tokens = max_tokens 33 | if key is not None: 34 | self.key = key 35 | else: 36 | self.key = 
os.environ.get('ALLES', '') 37 | self.headers['alles-apin-token'] = self.key 38 | 39 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 40 | 41 | def build_msgs(self, msgs_raw): 42 | 43 | messages = [] 44 | message = {'role': 'user', 'content': []} 45 | for msg in msgs_raw: 46 | if msg['type'] == 'image': 47 | pth = msg['value'] 48 | suffix = osp.splitext(pth)[-1].lower() 49 | media_type = mimetypes.types_map.get(suffix, None) 50 | assert media_type is not None 51 | 52 | item = { 53 | 'type': 'image', 54 | 'source': {'type': 'base64', 'media_type': media_type, 'data': encode_image_file_to_base64(pth)} 55 | } 56 | 57 | elif msg['type'] == 'text': 58 | item = {'type': 'text', 'text': msg['value']} 59 | else: 60 | raise NotImplementedError(f'Unsupported message type: {msg["type"]}') 61 | 62 | message['content'].append(item) 63 | messages.append(message) 64 | return messages 65 | 66 | def generate_inner(self, inputs, **kwargs) -> str: 67 | 68 | payload = json.dumps({ 69 | 'model': self.model, 70 | 'max_tokens': self.max_tokens, 71 | 'messages': self.build_msgs(msgs_raw=inputs), 72 | **kwargs 73 | }) 74 | response = requests.request('POST', url, headers=headers, data=payload) 75 | 76 | ret_code = response.status_code 77 | retry = self.retry 78 | while ret_code == 429 and retry > 0: 79 | sleep(15) 80 | response = requests.request('POST', url, headers=headers, data=payload) 81 | ret_code = response.status_code 82 | retry -= 1 83 | 84 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 85 | answer = self.fail_msg 86 | 87 | try: 88 | resp_struct = json.loads(response.text) 89 | answer = resp_struct['data']['content'][0]['text'].strip() 90 | except: 91 | pass 92 | 93 | return ret_code, answer, response 94 | 95 | 96 | class Claude3V(Claude_Wrapper): 97 | 98 | def generate(self, message, dataset=None): 99 | return super(Claude_Wrapper, self).generate(message) 100 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/cloudwalk.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | import os 3 | from .base import BaseAPI 4 | 5 | 6 | class CWWrapper(BaseAPI): 7 | 8 | is_api: bool = True 9 | 10 | def __init__(self, 11 | model: str = 'cw-congrong-v1.5', 12 | retry: int = 10, 13 | wait: int = 5, 14 | key: str = None, 15 | verbose: bool = True, 16 | system_prompt: str = None, 17 | temperature: float = 0, 18 | timeout: int = 600, 19 | api_base: str = 'http://cwapi-vlm01.cw_rb.azurebot.tk/v1/chat/completions', 20 | max_tokens: int = 1024, 21 | img_size: int = 512, 22 | img_detail: str = 'low', 23 | **kwargs): 24 | 25 | self.model = model 26 | self.cur_idx = 0 27 | self.fail_msg = 'Failed to obtain answer via API. ' 28 | self.max_tokens = max_tokens 29 | self.temperature = temperature 30 | 31 | base = os.environ.get('CW_API_BASE', None) 32 | self.api_base = base if base is not None else api_base 33 | 34 | env_key = os.environ.get('CW_API_KEY', None) 35 | self.key = env_key if env_key is not None else key 36 | assert self.key is not None, 'API key not provided. Please set CW_API_KEY environment variable or \ 37 | pass it to the constructor.' 
38 | 39 | assert img_size > 0 or img_size == -1 40 | self.img_size = -1 # allways send full size image 41 | assert img_detail in ['high', 'low'] 42 | self.img_detail = img_detail 43 | 44 | self.vision = True 45 | self.timeout = timeout 46 | 47 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | # inputs can be a lvl-2 nested list: [content1, content2, content3, ...] 50 | # content can be a string or a list of image & text 51 | def prepare_inputs(self, inputs): 52 | input_msgs = [] 53 | if self.system_prompt is not None: 54 | input_msgs.append(dict(role='system', content=self.system_prompt)) 55 | has_images = np.sum([x['type'] == 'image' for x in inputs]) 56 | if has_images: 57 | content_list = [] 58 | for msg in inputs: 59 | if msg['type'] == 'text': 60 | content_list.append(dict(type='text', text=msg['value'])) 61 | elif msg['type'] == 'image': 62 | from PIL import Image 63 | img = Image.open(msg['value']) 64 | b64 = encode_image_to_base64(img, target_size=self.img_size) 65 | img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail) 66 | content_list.append(dict(type='image_url', image_url=img_struct)) 67 | input_msgs.append(dict(role='user', content=content_list)) 68 | else: 69 | assert all([x['type'] == 'text' for x in inputs]) 70 | text = '\n'.join([x['value'] for x in inputs]) 71 | input_msgs.append(dict(role='user', content=text)) 72 | return input_msgs 73 | 74 | def generate_inner(self, inputs, **kwargs) -> str: 75 | input_msgs = self.prepare_inputs(inputs) 76 | temperature = kwargs.pop('temperature', self.temperature) 77 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 78 | 79 | if 0 < max_tokens <= 100: 80 | self.logger.warning( 81 | 'Less than 100 tokens left, ' 82 | 'may exceed the context window with some additional meta symbols. ' 83 | ) 84 | if max_tokens <= 0: 85 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' 86 | 87 | headers = {'Content-Type': 'application/json', 'Authorization': f'{self.key}'} 88 | payload = dict( 89 | model=self.model, 90 | messages=input_msgs, 91 | max_tokens=max_tokens, 92 | n=1, 93 | temperature=temperature, 94 | **kwargs) 95 | response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 96 | ret_code = response.status_code 97 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 98 | answer = self.fail_msg 99 | try: 100 | resp_struct = json.loads(response.text) 101 | answer = resp_struct['choices'][0]['message']['content'].strip() 102 | except: 103 | pass 104 | return ret_code, answer, response 105 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/glm_vision.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from vlmeval.utils.dataset import DATASET_TYPE 4 | from vlmeval.smp.vlm import encode_image_file_to_base64 5 | 6 | 7 | class GLMVisionWrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | 11 | def __init__(self, 12 | model: str, 13 | retry: int = 5, 14 | wait: int = 5, 15 | key: str = None, 16 | verbose: bool = True, 17 | system_prompt: str = None, 18 | max_tokens: int = 1024, 19 | proxy: str = None, 20 | **kwargs): 21 | 22 | self.model = model 23 | self.fail_msg = 'Failed to obtain answer via API. 
' 24 | self.default_params = { 25 | 'top_p': 0.6, 26 | 'top_k': 2, 27 | 'temperature': 0.8, 28 | 'repetition_penalty': 1.1, 29 | 'best_of': 1, 30 | 'do_sample': True, 31 | 'stream': False, 32 | 'max_tokens': max_tokens 33 | } 34 | if key is None: 35 | key = os.environ.get('GLMV_API_KEY', None) 36 | assert key is not None, ( 37 | 'Please set the API Key (obtain it here: ' 38 | 'https://open.bigmodel.cn/dev/howuse/introduction)' 39 | ) 40 | self.key = key 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | def image_to_base64(self, image_path): 44 | import base64 45 | with open(image_path, 'rb') as image_file: 46 | encoded_string = base64.b64encode(image_file.read()) 47 | return encoded_string.decode('utf-8') 48 | 49 | def build_msgs(self, msgs_raw, system_prompt=None, dataset=None): 50 | msgs = cp.deepcopy(msgs_raw) 51 | content = [] 52 | text = '' 53 | for i, msg in enumerate(msgs): 54 | if msg['type'] == 'text': 55 | text += msg['value'] 56 | elif msg['type'] == 'image': 57 | content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value'])))) 58 | if dataset is not None and DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: 59 | text += '\nShort Answer.' 60 | content.append(dict(type='text', text=text)) 61 | ret = [dict(role='user', content=content)] 62 | return ret 63 | 64 | def generate_inner(self, inputs, **kwargs) -> str: 65 | assert isinstance(inputs, str) or isinstance(inputs, list) 66 | inputs = [inputs] if isinstance(inputs, str) else inputs 67 | 68 | messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None)) 69 | 70 | url = 'https://api.chatglm.cn/v1/chat/completions' 71 | headers = { 72 | 'Content-Type': 'application/json', 73 | 'Request-Id': 'remote-test', 74 | 'Authorization': f'Bearer {self.key}' 75 | } 76 | payload = { 77 | 'model': self.model, 78 | 'messages': messages, 79 | **self.default_params 80 | } 81 | response = requests.post(url, headers=headers, data=json.dumps(payload), verify=False) 82 | output = [] 83 | try: 84 | assert response.status_code == 200 85 | for line in response.iter_lines(): 86 | data = json.loads(line.decode('utf-8').lstrip('data: ')) 87 | output.append(data['choices'][0]['message']['content']) 88 | answer = ''.join(output).replace('', '') 89 | if self.verbose: 90 | self.logger.info(f'inputs: {inputs}\nanswer: {answer}') 91 | return 0, answer, 'Succeeded! 
' 92 | except Exception as err: 93 | if self.verbose: 94 | self.logger.error(err) 95 | self.logger.error(f'The input messages are {inputs}.') 96 | return -1, self.fail_msg, '' 97 | 98 | 99 | class GLMVisionAPI(GLMVisionWrapper): 100 | 101 | def generate(self, message, dataset=None): 102 | return super(GLMVisionAPI, self).generate(message, dataset=dataset) 103 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/gpt_int.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | import requests 4 | from ..smp import * 5 | from .gpt import GPT_context_window, OpenAIWrapper 6 | 7 | url = 'http://ecs.sv.us.alles-apin.openxlab.org.cn/v1/openai/v2/text/chat' 8 | headers = { 9 | 'Content-Type': 'application/json' 10 | } 11 | 12 | 13 | class OpenAIWrapperInternal(OpenAIWrapper): 14 | 15 | is_api: bool = True 16 | 17 | def __init__(self, 18 | model: str = 'gpt-3.5-turbo-0125', 19 | retry: int = 5, 20 | wait: int = 3, 21 | verbose: bool = True, 22 | system_prompt: str = None, 23 | temperature: float = 0, 24 | timeout: int = 60, 25 | max_tokens: int = 1024, 26 | img_size: int = 512, 27 | img_detail: str = 'low', 28 | **kwargs): 29 | 30 | self.model = model 31 | if 'KEYS' in os.environ and osp.exists(os.environ['KEYS']): 32 | keys = load(os.environ['KEYS']) 33 | headers['alles-apin-token'] = keys.get('alles-apin-token', '') 34 | elif 'ALLES' in os.environ: 35 | headers['alles-apin-token'] = os.environ['ALLES'] 36 | self.headers = headers 37 | self.temperature = temperature 38 | self.timeout = timeout 39 | self.max_tokens = max_tokens 40 | 41 | assert img_size > 0 or img_size == -1 42 | self.img_size = img_size 43 | assert img_detail in ['high', 'low'] 44 | self.img_detail = img_detail 45 | 46 | super(OpenAIWrapper, self).__init__( 47 | wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | def generate_inner(self, inputs, **kwargs) -> str: 50 | input_msgs = self.prepare_inputs(inputs) 51 | 52 | temperature = kwargs.pop('temperature', self.temperature) 53 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 54 | 55 | # Held out 100 tokens as buffer 56 | context_window = GPT_context_window(self.model) 57 | max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) 58 | if 0 < max_tokens <= 100: 59 | print('Less than 100 tokens left, may exceed the context window with some additional meta symbols. ') 60 | if max_tokens <= 0: 61 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. 
' 62 | 63 | payload = dict( 64 | model=self.model, 65 | messages=input_msgs, 66 | max_tokens=max_tokens, 67 | n=1, 68 | stop=None, 69 | timeout=self.timeout, 70 | temperature=temperature, 71 | **kwargs) 72 | 73 | response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 74 | ret_code = response.status_code 75 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 76 | 77 | answer = self.fail_msg 78 | try: 79 | resp_struct = json.loads(response.text) 80 | assert resp_struct['msg'] == 'ok' and resp_struct['msgCode'] == '10000', resp_struct 81 | answer = resp_struct['data']['choices'][0]['message']['content'].strip() 82 | except: 83 | pass 84 | return ret_code, answer, response 85 | 86 | 87 | class GPT4V_Internal(OpenAIWrapperInternal): 88 | 89 | def generate(self, message, dataset=None): 90 | return super(GPT4V_Internal, self).generate(message) 91 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/qwen_api.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | import os 3 | from vlmeval.api.base import BaseAPI 4 | from vlmeval.smp import * 5 | 6 | 7 | # Note: This is a pure language model API. 8 | class QwenAPI(BaseAPI): 9 | 10 | is_api: bool = True 11 | 12 | def __init__(self, 13 | model: str = 'qwen-max-1201', 14 | retry: int = 5, 15 | wait: int = 5, 16 | verbose: bool = True, 17 | seed: int = 2680, 18 | temperature: float = 0.0, 19 | system_prompt: str = None, 20 | key: str = None, 21 | max_tokens: int = 1024, 22 | proxy: str = None, 23 | **kwargs): 24 | 25 | assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext'] 26 | self.model = model 27 | import dashscope 28 | self.fail_msg = 'Failed to obtain answer via API. ' 29 | self.max_tokens = max_tokens 30 | self.temperature = temperature 31 | self.seed = seed 32 | if key is None: 33 | key = os.environ.get('DASHSCOPE_API_KEY', None) 34 | assert key is not None, ( 35 | 'Please set the API Key (obtain it here: ' 36 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 37 | ) 38 | dashscope.api_key = key 39 | if proxy is not None: 40 | proxy_set(proxy) 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | @staticmethod 44 | def build_msgs(msgs_raw, system_prompt=None): 45 | msgs = cp.deepcopy(msgs_raw) 46 | ret = [] 47 | if system_prompt is not None: 48 | ret.append(dict(role='system', content=system_prompt)) 49 | for i, msg in enumerate(msgs): 50 | role = 'user' if i % 2 == 0 else 'assistant' 51 | ret.append(dict(role=role, content=msg)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | inputs = [inputs] if isinstance(inputs, str) else inputs 58 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 59 | 60 | import dashscope 61 | response = dashscope.Generation.call( 62 | model=self.model, 63 | messages=messages, 64 | seed=self.seed, 65 | temperature=self.temperature, 66 | max_tokens=self.max_tokens, 67 | result_format='message', # set the result to be "message" format. 68 | ) 69 | if response.status_code != HTTPStatus.OK: 70 | return -1, 'Error: Bad Response Statuse Code. ', f'The response status code is {response.status_code}. 
' 71 | 72 | try: 73 | return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! ' 74 | except Exception as err: 75 | return -1, f'Error: Failed to parse the response. {err}', response 76 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/qwen_vl_api.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | 5 | class QwenVLWrapper(BaseAPI): 6 | 7 | is_api: bool = True 8 | 9 | def __init__(self, 10 | model: str = 'qwen-vl-plus', 11 | retry: int = 5, 12 | wait: int = 5, 13 | key: str = None, 14 | verbose: bool = True, 15 | temperature: float = 0.0, 16 | system_prompt: str = None, 17 | max_tokens: int = 1024, 18 | proxy: str = None, 19 | **kwargs): 20 | 21 | assert model in ['qwen-vl-plus', 'qwen-vl-max'] 22 | self.model = model 23 | import dashscope 24 | self.fail_msg = 'Failed to obtain answer via API. ' 25 | self.max_tokens = max_tokens 26 | self.temperature = temperature 27 | if key is None: 28 | key = os.environ.get('DASHSCOPE_API_KEY', None) 29 | assert key is not None, ( 30 | 'Please set the API Key (obtain it here: ' 31 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 32 | ) 33 | dashscope.api_key = key 34 | if proxy is not None: 35 | proxy_set(proxy) 36 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 37 | 38 | @staticmethod 39 | def build_msgs(msgs_raw, system_prompt=None): 40 | msgs = cp.deepcopy(msgs_raw) 41 | ret = [] 42 | if system_prompt is not None: 43 | content = list(dict(text=system_prompt)) 44 | ret.append(dict(role='system', content=content)) 45 | content = [] 46 | for msg in msgs: 47 | if msg['type'] == 'text': 48 | content.append(dict(text=msg['value'])) 49 | elif msg['type'] == 'image': 50 | content.append(dict(image='file://' + msg['value'])) 51 | ret.append(dict(role='user', content=content)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | pure_text = np.all([x['type'] == 'text' for x in inputs]) 58 | assert not pure_text 59 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 60 | gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature) 61 | gen_config.update(kwargs) 62 | try: 63 | response = MultiModalConversation.call(model=self.model, messages=messages) 64 | if self.verbose: 65 | print(response) 66 | answer = response.output.choices[0]['message']['content'][0]['text'] 67 | return 0, answer, 'Succeeded! 
' 68 | except Exception as err: 69 | if self.verbose: 70 | self.logger.error(err) 71 | self.logger.error(f'The input messages are {inputs}.') 72 | 73 | return -1, '', '' 74 | 75 | 76 | class QwenVLAPI(QwenVLWrapper): 77 | 78 | def generate(self, message, dataset=None): 79 | return super(QwenVLAPI, self).generate(message) 80 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/reka.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import mimetypes 5 | 6 | 7 | class Reka_Wrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | INTERLEAVE: bool = False 11 | 12 | def __init__(self, 13 | model: str = 'reka-flash-20240226', 14 | key: str = None, 15 | retry: int = 10, 16 | wait: int = 3, 17 | system_prompt: str = None, 18 | verbose: bool = True, 19 | temperature: float = 0, 20 | max_tokens: int = 1024, 21 | **kwargs): 22 | 23 | try: 24 | import reka 25 | except ImportError: 26 | raise ImportError('Please install reka by running "pip install reka-api"') 27 | 28 | self.model = model 29 | default_kwargs = dict(temperature=temperature, request_output_len=max_tokens) 30 | default_kwargs.update(kwargs) 31 | self.kwargs = default_kwargs 32 | if key is not None: 33 | self.key = key 34 | else: 35 | self.key = os.environ.get('REKA_API_KEY', '') 36 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 37 | 38 | def generate_inner(self, inputs, **kwargs) -> str: 39 | import reka 40 | reka.API_KEY = self.key 41 | prompt, image_path = self.message_to_promptimg(inputs) 42 | image_b64 = encode_image_file_to_base64(image_path) 43 | 44 | response = reka.chat( 45 | model_name=self.model, 46 | human=prompt, 47 | media_url=f'data:image/jpeg;base64,{image_b64}', 48 | **self.kwargs) 49 | 50 | try: 51 | return 0, response['text'], response 52 | except: 53 | return -1, self.fail_msg, response 54 | 55 | 56 | class Reka(Reka_Wrapper): 57 | 58 | def generate(self, message, dataset=None): 59 | return super(Reka_Wrapper, self).generate(message) 60 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/api/stepai.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | url = 'https://api.stepfun.com/v1/chat/completions' 5 | headers = { 6 | 'Content-Type': 'application/json', 7 | 'Authorization': 'Bearer {}', 8 | } 9 | 10 | 11 | class StepAPI_INT(BaseAPI): 12 | 13 | is_api: bool = True 14 | 15 | def __init__(self, 16 | model: str = 'step-1v-8k', 17 | retry: int = 10, 18 | wait: int = 3, 19 | key: str = None, 20 | temperature: float = 0, 21 | max_tokens: int = 300, 22 | verbose: bool = True, 23 | system_prompt: str = None, 24 | **kwargs): 25 | self.model = model 26 | self.fail_msg = 'Fail to obtain answer via API.' 
27 | self.headers = headers 28 | self.temperature = temperature 29 | self.max_tokens = max_tokens 30 | self.system_prompt = system_prompt 31 | if key is not None: 32 | self.key = key 33 | else: 34 | self.key = os.environ.get('STEPAI_API_KEY', '') 35 | headers['Authorization'] = headers['Authorization'].format(self.key) 36 | 37 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 38 | 39 | @staticmethod 40 | def build_msgs(msgs_raw): 41 | messages = [] 42 | message = {'role': 'user', 'content': []} 43 | 44 | for msg in msgs_raw: 45 | if msg['type'] == 'image': 46 | image_b64 = encode_image_file_to_base64(msg['value']) 47 | message['content'].append({ 48 | 'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)}, 49 | 'type': 'image_url' 50 | }) 51 | elif msg['type'] == 'text': 52 | message['content'].append({ 53 | 'text': msg['value'], 54 | 'type': 'text' 55 | }) 56 | 57 | messages.append(message) 58 | return messages 59 | 60 | def generate_inner(self, inputs, **kwargs) -> str: 61 | print(inputs, '\n') 62 | payload = dict( 63 | model=self.model, 64 | max_tokens=self.max_tokens, 65 | temperature=self.temperature, 66 | messages=self.build_msgs(msgs_raw=inputs), 67 | **kwargs) 68 | response = requests.post(url, headers=headers, data=json.dumps(payload)) 69 | ret_code = response.status_code 70 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 71 | 72 | answer = self.fail_msg 73 | try: 74 | resp_struct = json.loads(response.text) 75 | answer = resp_struct['choices'][0]['message']['content'].strip() 76 | except: 77 | pass 78 | return ret_code, answer, response 79 | 80 | 81 | class Step1V_INT(StepAPI_INT): 82 | 83 | def generate(self, message, dataset=None): 84 | return super(StepAPI_INT, self).generate(message) 85 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/evaluate/OCRBench.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | 3 | 4 | def OCRBench_eval(eval_file): 5 | OCRBench_score = { 6 | 'Regular Text Recognition': 0, 7 | 'Irregular Text Recognition': 0, 8 | 'Artistic Text Recognition': 0, 9 | 'Handwriting Recognition': 0, 10 | 'Digit String Recognition': 0, 11 | 'Non-Semantic Text Recognition': 0, 12 | 'Scene Text-centric VQA': 0, 13 | 'Doc-oriented VQA': 0, 14 | 'Key Information Extraction': 0, 15 | 'Handwritten Mathematical Expression Recognition': 0 16 | } 17 | 18 | logger = get_logger('Evaluation') 19 | 20 | data = load(eval_file) 21 | lt = len(data) 22 | lines = [data.iloc[i] for i in range(lt)] 23 | for i in tqdm(range(len(lines))): 24 | line = lines[i] 25 | predict = str(line['prediction']) 26 | answers = eval(line['answer']) 27 | category = line['category'] 28 | if category == 'Handwritten Mathematical Expression Recognition': 29 | for j in range(len(answers)): 30 | answer = answers[j].strip().replace('\n', ' ').replace(' ', '') 31 | predict = predict.strip().replace('\n', ' ').replace(' ', '') 32 | if answer in predict: 33 | OCRBench_score[category] += 1 34 | break 35 | else: 36 | for j in range(len(answers)): 37 | answer = answers[j].lower().strip().replace('\n', ' ') 38 | predict = predict.lower().strip().replace('\n', ' ') 39 | if answer in predict: 40 | OCRBench_score[category] += 1 41 | break 42 | 43 | final_score_dict = {} 44 | final_score_dict['Text Recognition'] = ( 45 | OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] 46 | + 
OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition'] 47 | + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'] 48 | ) 49 | final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] 50 | final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] 51 | final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] 52 | final_score_dict['Handwritten Mathematical Expression Recognition'] = \ 53 | OCRBench_score['Handwritten Mathematical Expression Recognition'] 54 | final_score_dict['Final Score'] = ( 55 | final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] 56 | + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] 57 | + final_score_dict['Handwritten Mathematical Expression Recognition'] 58 | ) 59 | final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 60 | score_pth = eval_file.replace('.xlsx', '_score.json') 61 | dump(final_score_dict, score_pth) 62 | logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 63 | logger.info('Score: ') 64 | for key, value in final_score_dict.items(): 65 | logger.info('{}:{}'.format(key, value)) 66 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .yes_or_no import default_rating, MME_rating, YOrN_eval 2 | from .mmvet_eval import MMVet_eval 3 | from .multiple_choice import multiple_choice_eval 4 | from .coco_eval import COCO_eval 5 | from .vqa_eval import VQAEval 6 | from .mathvista_eval import MathVista_eval 7 | from .llavabench import LLaVABench_eval 8 | from .misc import build_judge 9 | from .OCRBench import OCRBench_eval 10 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/evaluate/coco_eval.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from pycocoevalcap.bleu.bleu import Bleu 3 | from pycocoevalcap.rouge.rouge import Rouge 4 | from pycocoevalcap.cider.cider import Cider 5 | 6 | 7 | class COCO_Caption_Scorer(): 8 | def __init__(self, ref, gt): 9 | self.ref = ref 10 | self.gt = gt 11 | print('setting up scorers...') 12 | self.scorers = [ 13 | (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), 14 | # (Meteor(), "METEOR"), # need java version 11.0.16+ 15 | (Rouge(), 'ROUGE_L'), 16 | (Cider(), 'CIDEr'), 17 | # (Spice(), "SPICE"), # need java version 11.0.16+ 18 | ] 19 | 20 | def compute_scores(self): 21 | total_scores = {} 22 | for scorer, method in self.scorers: 23 | print('computing %s score...' 
% (scorer.method())) 24 | score, scores = scorer.compute_score(self.gt, self.ref) 25 | if type(method) == list: 26 | for sc, scs, m in zip(score, scores, method): 27 | print('%s: %0.3f' % (m, sc * 100)) 28 | total_scores['Bleu'] = [x * 100 for x in score] 29 | else: 30 | print('%s: %0.3f' % (method, score * 100)) 31 | total_scores[method] = score * 100 32 | 33 | print('*****DONE*****') 34 | for key, value in total_scores.items(): 35 | print('{}:{}'.format(key, value)) 36 | return total_scores 37 | 38 | 39 | def COCO_eval(eval_file, nproc=4, verbose=False): 40 | logger = get_logger('Evaluation') 41 | 42 | data = load(eval_file) 43 | 44 | lt = len(data) 45 | lines = [data.iloc[i] for i in range(lt)] 46 | ref = {} 47 | gt = {} 48 | for i, line in enumerate(lines): 49 | ref[str(i)] = [str(line['prediction'])] 50 | gt[str(i)] = eval(line['answer']) 51 | 52 | scorer = COCO_Caption_Scorer(ref, gt) 53 | coco_caption_score_dict = scorer.compute_scores() 54 | 55 | score_pth = eval_file.replace('.xlsx', '_score.json') 56 | dump(coco_caption_score_dict, score_pth) 57 | logger.info(f'COCO_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 58 | logger.info('Score: ') 59 | for key, value in coco_caption_score_dict.items(): 60 | logger.info('{}:{}'.format(key, value)) 61 | 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser(description='Inference LLM Answers. ') 65 | parser.add_argument('--data', type=str, help='The question set for inference, in excel / tsv / json format. ') 66 | parser.add_argument('--nproc', type=int, default=4) 67 | parser.add_argument('--verbose', action='store_true') 68 | args = parser.parse_args() 69 | return args 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | COCO_eval(eval_file=args.data, nproc=args.nproc, verbose=args.verbose) 75 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/evaluate/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | from vlmeval.api import OpenAIWrapper, OpenAIWrapperInternal 3 | from vlmeval.smp import load_env 4 | 5 | INTERNAL = os.environ.get('INTERNAL', 0) 6 | 7 | 8 | def build_judge(**kwargs): 9 | model = kwargs.pop('model', None) 10 | load_env() 11 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None) 12 | if LOCAL_LLM is None: 13 | model_map = { 14 | 'gpt-4-turbo': 'gpt-4-1106-preview', 15 | 'gpt-4-0613': 'gpt-4-0613', 16 | 'gpt-4-0125': 'gpt-4-0125-preview', 17 | 'gpt-4-0409': 'gpt-4-turbo-2024-04-09', 18 | 'chatgpt-1106': 'gpt-3.5-turbo-1106', 19 | 'chatgpt-0125': 'gpt-3.5-turbo-0125', 20 | } 21 | model_version = model_map[model] 22 | else: 23 | model_version = LOCAL_LLM 24 | if INTERNAL: 25 | model = OpenAIWrapperInternal(model_version, **kwargs) 26 | else: 27 | model = OpenAIWrapper(model_version, **kwargs) 28 | return model 29 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .vlm import * 3 | from .misc import * 4 | from .log import * 5 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_initialized = {} 4 | 5 | 6 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 7 | logger = 
logging.getLogger(name) 8 | if name in logger_initialized: 9 | return logger 10 | 11 | for logger_name in logger_initialized: 12 | if name.startswith(logger_name): 13 | return logger 14 | 15 | stream_handler = logging.StreamHandler() 16 | handlers = [stream_handler] 17 | 18 | try: 19 | import torch.distributed as dist 20 | if dist.is_available() and dist.is_initialized(): 21 | rank = dist.get_rank() 22 | else: 23 | rank = 0 24 | except ImportError: 25 | rank = 0 26 | 27 | if rank == 0 and log_file is not None: 28 | file_handler = logging.FileHandler(log_file, file_mode) 29 | handlers.append(file_handler) 30 | 31 | formatter = logging.Formatter( 32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 33 | for handler in handlers: 34 | handler.setFormatter(formatter) 35 | handler.setLevel(log_level) 36 | logger.addHandler(handler) 37 | 38 | if rank == 0: 39 | logger.setLevel(log_level) 40 | else: 41 | logger.setLevel(logging.ERROR) 42 | 43 | logger_initialized[name] = True 44 | return logger 45 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/smp/vlm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import pandas as pd 4 | import numpy as np 5 | import string 6 | from uuid import uuid4 7 | import os.path as osp 8 | import base64 9 | from PIL import Image 10 | from .file import load, dump 11 | Image.MAX_IMAGE_PIXELS = 1e9 12 | 13 | 14 | def mmqa_display(question, target_size=512): 15 | question = {k.lower(): v for k, v in question.items()} 16 | keys = list(question.keys()) 17 | keys = [k for k in keys if k not in ['index', 'image']] 18 | 19 | images = question['image'] 20 | if isinstance(images, str): 21 | images = [images] 22 | 23 | idx = question.pop('index', 'XXX') 24 | print(f'INDEX: {idx}') 25 | 26 | for im in images: 27 | image = decode_base64_to_image(im, target_size=target_size) 28 | display(image) # noqa: F821 29 | 30 | for k in keys: 31 | try: 32 | if not pd.isna(question[k]): 33 | print(f'{k.upper()}. {question[k]}') 34 | except ValueError: 35 | if False in pd.isna(question[k]): 36 | print(f'{k.upper()}. 
{question[k]}') 37 | 38 | 39 | def encode_image_to_base64(img, target_size=-1): 40 | # if target_size == -1, will not do resizing 41 | # else, will set the max_size ot (target_size, target_size) 42 | if img.mode in ('RGBA', 'P'): 43 | img = img.convert('RGB') 44 | tmp = osp.join('/tmp', str(uuid4()) + '.jpg') 45 | if target_size > 0: 46 | img.thumbnail((target_size, target_size)) 47 | img.save(tmp) 48 | with open(tmp, 'rb') as image_file: 49 | image_data = image_file.read() 50 | ret = base64.b64encode(image_data).decode('utf-8') 51 | os.remove(tmp) 52 | return ret 53 | 54 | 55 | def encode_image_file_to_base64(image_path, target_size=-1): 56 | image = Image.open(image_path) 57 | return encode_image_to_base64(image, target_size=target_size) 58 | 59 | 60 | def decode_base64_to_image(base64_string, target_size=-1): 61 | image_data = base64.b64decode(base64_string) 62 | image = Image.open(io.BytesIO(image_data)) 63 | if image.mode in ('RGBA', 'P'): 64 | image = image.convert('RGB') 65 | if target_size > 0: 66 | image.thumbnail((target_size, target_size)) 67 | return image 68 | 69 | 70 | def decode_base64_to_image_file(base64_string, image_path, target_size=-1): 71 | image = decode_base64_to_image(base64_string, target_size=target_size) 72 | image.save(image_path) 73 | 74 | 75 | def build_option_str(option_dict): 76 | s = 'There are several options: \n' 77 | for c, content in option_dict.items(): 78 | if not pd.isna(content): 79 | s += f'{c}. {content}\n' 80 | return s 81 | 82 | 83 | def isimg(s): 84 | return osp.exists(s) or s.startswith('http') 85 | 86 | 87 | def read_ok(img_path): 88 | if not osp.exists(img_path): 89 | return False 90 | try: 91 | im = Image.open(img_path) 92 | assert im.size[0] > 0 and im.size[1] > 0 93 | return True 94 | except: 95 | return False 96 | 97 | 98 | def gpt_key_set(): 99 | openai_key = os.environ.get('OPENAI_API_KEY', None) 100 | return isinstance(openai_key, str) and openai_key.startswith('sk-') 101 | 102 | 103 | def apiok(wrapper): 104 | s = wrapper.generate('Hello!') 105 | return wrapper.fail_msg not in s 106 | 107 | 108 | def circular_pred(df, extract_func=None): 109 | if extract_func is None: 110 | extract_func = lambda x: x # noqa: E731 111 | df = df.sort_values('index') 112 | from vlmeval.utils import can_infer_option 113 | shift = int(1e6) 114 | 115 | choices = [extract_func(x) for x in df['prediction']] 116 | pred_map = {i: c for i, c in zip(df['index'], choices)} 117 | flag_map = {i: True for i in pred_map if i < 1e6} 118 | valid_map = {i: True for i in pred_map if i < 1e6} 119 | for i in df['index']: 120 | if i >= shift and pred_map[i] and pred_map[i - shift]: 121 | if ( 122 | pred_map[i] not in list(string.ascii_uppercase) or # noqa: W504 123 | pred_map[i - shift] not in list(string.ascii_uppercase) 124 | ): 125 | 126 | valid_map[i % shift] = False 127 | continue 128 | if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1: 129 | continue 130 | else: 131 | flag_map[i % shift] = False 132 | flag_map = {k: v for k, v in flag_map.items() if valid_map[k]} 133 | flags = list(flag_map.values()) 134 | return np.mean(flags) 135 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .matching_util import can_infer, can_infer_option, can_infer_text 2 | from .mp_util import track_progress_rich 3 | from .custom_prompt import CustomPrompt 4 | from .dataset_config import dataset_URLs, img_root_map, 
DATASET_TYPE, abbr2full 5 | from .dataset import TSVDataset, split_MMMU 6 | from .result_transfer import MMMU_result_transfer, MMTBench_result_transfer 7 | 8 | 9 | __all__ = [ 10 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 11 | 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt', 12 | 'split_MMMU', 'abbr2full', 'MMMU_result_transfer', 'MMTBench_result_transfer' 13 | ] 14 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/utils/custom_prompt.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | from .dataset_config import img_root_map 3 | from abc import abstractmethod 4 | 5 | 6 | class CustomPrompt: 7 | 8 | @abstractmethod 9 | def use_custom_prompt(self, dataset): 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def build_prompt(self, line, dataset): 14 | raise NotImplementedError 15 | 16 | def dump_image(self, line, dataset): 17 | ROOT = LMUDataRoot() 18 | assert isinstance(dataset, str) 19 | img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) 20 | os.makedirs(img_root, exist_ok=True) 21 | 22 | if 'image' in line: 23 | if isinstance(line['image'], list): 24 | tgt_path = [] 25 | assert 'image_path' in line 26 | for img, im_name in zip(line['image'], line['image_path']): 27 | path = osp.join(img_root, im_name) 28 | if not read_ok(path): 29 | decode_base64_to_image_file(img, path) 30 | tgt_path.append(path) 31 | else: 32 | tgt_path = osp.join(img_root, f"{line['index']}.jpg") 33 | if not read_ok(tgt_path): 34 | decode_base64_to_image_file(line['image'], tgt_path) 35 | tgt_path = [tgt_path] 36 | else: 37 | assert 'image_path' in line 38 | tgt_path = toliststr(line['image_path']) 39 | 40 | return tgt_path 41 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/utils/matching_util.py: -------------------------------------------------------------------------------- 1 | import string 2 | import copy as cp 3 | import os 4 | from ..smp import * 5 | 6 | 7 | def can_infer_option(answer, choices): 8 | verbose = os.environ.get('VERBOSE', 0) 9 | # Choices is a dictionary 10 | if 'Failed to obtain answer via API' in answer: 11 | return False 12 | 13 | reject_to_answer = [ 14 | "Sorry, I can't help with images of people yet.", 15 | "I can't process this file.", 16 | "I'm sorry, but without the image provided", 17 | 'Cannot determine the answer' 18 | ] 19 | for err in reject_to_answer: 20 | if err in answer: 21 | return 'Z' 22 | 23 | def count_choice(splits, choices, prefix='', suffix=''): 24 | cnt = 0 25 | for c in choices: 26 | if prefix + c + suffix in splits: 27 | cnt += 1 28 | return cnt 29 | 30 | answer_mod = cp.copy(answer) 31 | chars = '.()[],:;!*#{}' 32 | for c in chars: 33 | answer_mod = answer_mod.replace(c, ' ') 34 | 35 | splits = [x.strip() for x in answer_mod.split()] 36 | count = count_choice(splits, choices) 37 | 38 | if count == 1: 39 | for ch in choices: 40 | if 'A' in splits and len(splits) > 3 and verbose: 41 | logger = get_logger('Evaluation') 42 | logger.info(f'A might be a quantifier in the string: {answer}.') 43 | return False 44 | if ch in splits: 45 | return ch 46 | elif count == 0 and count_choice(splits, {'Z', ''}) == 1: 47 | return 'Z' 48 | return False 49 | 50 | 51 | def can_infer_text(answer, choices): 52 | answer = answer.lower() 53 | assert isinstance(choices, dict) 54 | for k in choices: 55 
| assert k in string.ascii_uppercase 56 | choices[k] = str(choices[k]).lower() 57 | cands = [] 58 | for k in choices: 59 | if choices[k] in answer: 60 | cands.append(k) 61 | if len(cands) == 1: 62 | return cands[0] 63 | return False 64 | 65 | 66 | def can_infer(answer, choices): 67 | answer = str(answer) 68 | copt = can_infer_option(answer, choices) 69 | return copt if copt else can_infer_text(answer, choices) 70 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/utils/result_transfer.py: -------------------------------------------------------------------------------- 1 | from ..evaluate.misc import build_judge 2 | from ..evaluate.multiple_choice import extract_answer_from_item 3 | 4 | from ..smp import * 5 | from .matching_util import can_infer 6 | from .mp_util import track_progress_rich 7 | 8 | 9 | def MMMU_result_transfer(result_path): 10 | res = {} 11 | result_data = load(result_path) 12 | mcq = result_data['A'].notna() 13 | lt = len(result_data) 14 | for i in range(lt): 15 | line = result_data.iloc[i] 16 | if mcq[i]: 17 | options = { 18 | cand: line[cand] 19 | for cand in string.ascii_uppercase 20 | if cand in line and not pd.isna(line[cand]) 21 | } 22 | prediction = line['prediction'] 23 | infer_prediction = can_infer(prediction, options) 24 | res[line['id']] = infer_prediction 25 | else: 26 | res[line['id']] = line['prediction'] 27 | result_json = result_path.replace('.xlsx', '.json') 28 | dump(res, result_json) 29 | return result_json 30 | 31 | 32 | def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): 33 | logger = get_logger('Evaluation') 34 | INTERNAL = os.environ.get('INTERNAL', 0) 35 | nproc = judge_kwargs.pop('nproc', 4) 36 | 37 | rd.seed(2680) 38 | suffix = eval_file.split('.')[-1] 39 | model = judge_kwargs['model'] 40 | assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] 41 | name_str_map = { 42 | 'chatgpt-0125': 'openai', 43 | 'gpt-4-0125': 'gpt4' 44 | } 45 | name_str = name_str_map[model] if model in name_str_map else model 46 | 47 | if model == 'exact_matching': 48 | model = None 49 | else: 50 | if INTERNAL or gpt_key_set(): 51 | model = build_judge(**judge_kwargs) 52 | else: 53 | logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') 54 | model = None 55 | 56 | logger.info(f'Evaluating {eval_file}') 57 | result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') 58 | result = {} 59 | if osp.exists(result_file): 60 | result = load(result_file) 61 | 62 | data = load(eval_file) 63 | assert 'index' in data, 'Essential columns missing in the eval_file.' 
64 | 65 | data = data.sort_values(by='index') 66 | data['prediction'] = [str(x) for x in data['prediction']] 67 | for k in data.keys(): 68 | data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) 69 | 70 | idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))} 71 | idx2lines = {k: v for k, v in idx2lines.items() if k not in result} 72 | 73 | indices = list(idx2lines.keys()) 74 | lines = [idx2lines[i] for i in indices] 75 | tups = [(model, line) for line in lines] 76 | res = track_progress_rich( 77 | extract_answer_from_item, 78 | tups, 79 | nproc=nproc, 80 | chunksize=nproc, 81 | save=result_file, 82 | keys=indices) 83 | 84 | for i, r in zip(indices, res): 85 | if i in result: 86 | assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log'] 87 | else: 88 | result[i] = r 89 | 90 | indices = list(data['index']) 91 | data['opt'] = [result[i]['opt'] for i in data['index']] 92 | data['log'] = [result[i]['log'] for i in data['index']] 93 | 94 | # load split 95 | output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') 96 | dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) 97 | return output_path 98 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.set_grad_enabled(False) 4 | torch.manual_seed(1234) 5 | from .base import BaseModel 6 | from .cogvlm import CogVlm, GLM4v 7 | from .emu import Emu 8 | from .idefics import IDEFICS, IDEFICS2 9 | from .instructblip import InstructBLIP 10 | from .llava import LLaVA, LLaVA_Next, LLaVA_XTuner 11 | from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V 12 | from .minigpt4 import MiniGPT4 13 | from .mmalaya import MMAlaya 14 | from .monkey import Monkey, MonkeyChat 15 | from .mplug_owl2 import mPLUG_Owl2 16 | from .omnilmm import OmniLMM12B 17 | from .open_flamingo import OpenFlamingo 18 | from .pandagpt import PandaGPT 19 | from .qwen_vl import QwenVL, QwenVLChat 20 | from .transcore_m import TransCoreM 21 | from .visualglm import VisualGLM 22 | from .xcomposer import ShareCaptioner, XComposer, XComposer2, XComposer2_4KHD 23 | from .yi_vl import Yi_VL 24 | from .internvl_chat import InternVLChat 25 | from .deepseek_vl import DeepSeekVL 26 | from .mgm import Mini_Gemini 27 | from .bunnyllama3 import BunnyLLama3 28 | from .vxverse import VXVERSE 29 | from .paligemma import PaliGemma 30 | from .qh_360vl import QH_360VL 31 | from .phi3_vision import Phi3Vision 32 | from .wemm import WeMM 33 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/bunnyllama3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from PIL import Image 5 | import warnings 6 | 7 | from .base import BaseModel 8 | from ..smp import * 9 | from ..utils import DATASET_TYPE 10 | 11 | 12 | class BunnyLLama3(BaseModel): 13 | 14 | INSTALL_REQ = False 15 | INTERLEAVE = False 16 | 17 | def __init__(self, model_path='BAAI/Bunny-Llama-3-8B-V', **kwargs): 18 | assert model_path is not None 19 | transformers.logging.set_verbosity_error() 20 | transformers.logging.disable_progress_bar() 21 | warnings.filterwarnings('ignore') 22 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 23 | self.model = 
AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True) 24 | self.kwargs = kwargs 25 | 26 | def generate_inner(self, message, dataset=None): 27 | prompt, image_path = self.message_to_promptimg(message) 28 | text = f"A chat between a curious user and an artificial intelligence assistant. \ 29 | The assistant gives helpful, detailed, and polite answers to the user's questions. \ 30 | USER: <image>\n{prompt} ASSISTANT:" 31 | text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')] 32 | input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0) 33 | image = Image.open(image_path).convert('RGB') 34 | image_tensor = self.model.process_images([image], self.model.config).to(dtype=self.model.dtype) 35 | 36 | output_ids = self.model.generate(input_ids, images=image_tensor, max_new_tokens=100, use_cache=True)[0] 37 | response = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True) 38 | return response 39 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/deepseek_vl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from transformers import AutoModelForCausalLM 4 | import warnings 5 | from .base import BaseModel 6 | 7 | 8 | class DeepSeekVL(BaseModel): 9 | 10 | INSTALL_REQ = True 11 | INTERLEAVE = True 12 | 13 | def check_install(self): 14 | try: 15 | import deepseek_vl 16 | except ImportError: 17 | warnings.warn( 18 | 'Please first install deepseek_vl from source codes in: https://github.com/deepseek-ai/DeepSeek-VL') 19 | sys.exit(-1) 20 | 21 | def __init__(self, model_path='deepseek-ai/deepseek-vl-1.3b-chat', **kwargs): 22 | self.check_install() 23 | assert model_path is not None 24 | self.model_path = model_path 25 | from deepseek_vl.models import VLChatProcessor 26 | 27 | self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path) 28 | self.tokenizer = self.vl_chat_processor.tokenizer 29 | 30 | model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) 31 | self.model = model.to(torch.bfloat16).cuda().eval() 32 | 33 | torch.cuda.empty_cache() 34 | default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True) 35 | default_kwargs.update(kwargs) 36 | self.kwargs = default_kwargs 37 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 38 | 39 | def prepare_inputs(self, message): 40 | content, images = '', [] 41 | for s in message: 42 | if s['type'] == 'image': 43 | images.append(s['value']) 44 | content += '<image_placeholder>' 45 | elif s['type'] == 'text': 46 | content += s['value'] 47 | conversation = [ 48 | dict(role='User', content=content, images=images), 49 | dict(role='Assistant', content='') 50 | ] 51 | return conversation 52 | 53 | def generate_inner(self, message, dataset=None): 54 | conversation = self.prepare_inputs(message) 55 | from deepseek_vl.utils.io import load_pil_images 56 | pil_images = load_pil_images(conversation) 57 | prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True) 58 | prepare_inputs = prepare_inputs.to(self.model.device) 59 | inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs) 60 | 61 | outputs = self.model.language_model.generate( 62 | inputs_embeds=inputs_embeds, 63 | attention_mask=prepare_inputs.attention_mask, 64 | pad_token_id=self.tokenizer.eos_token_id, 65 | bos_token_id=self.tokenizer.bos_token_id, 66 | eos_token_id=self.tokenizer.eos_token_id, 67 | **self.kwargs) 68 | answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True) 69 | return answer 70 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/emu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from PIL import Image 4 | import os.path as osp 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class Emu(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = True 13 | 14 | def __init__(self, 15 | model_path='BAAI/Emu2-Chat', 16 | **kwargs): 17 | 18 | self.model_path = model_path 19 | assert osp.exists(model_path) or splitlen(model_path) == 2 20 | 21 | from transformers import AutoModelForCausalLM, AutoTokenizer 22 | from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model 23 | 24 | local_rank = os.environ.get('LOCAL_RANK', 0) 25 | 26 | device_num = torch.cuda.device_count() 27 | assert local_rank * 2 <= device_num, 'The number of devices does not match the world size' 28 | assert device_num >= 2, 'You need at least 2 GPUs to use EMU' 29 | 30 | device_1 = local_rank 31 | device_2 = local_rank + device_num // 2 32 | 33 | torch.cuda.set_device(device_1) 34 | torch.cuda.set_device(device_2) 35 | 36 | tokenizer = AutoTokenizer.from_pretrained(model_path) # "BAAI/Emu2-Chat" 37 | self.tokenizer = tokenizer 38 | with init_empty_weights(): 39 | model = AutoModelForCausalLM.from_pretrained( 40 | model_path, # "BAAI/Emu2-Chat" 41 | torch_dtype=torch.bfloat16, 42 | low_cpu_mem_usage=True, 43 | trust_remote_code=True) 44 | 45 | device_map = infer_auto_device_map( 46 | model, 47 | max_memory={ 48 | device_1: '38GiB', 49 | device_2: '38GiB' 50 | }, 51 | no_split_module_classes=['Block', 'LlamaDecoderLayer']) 52 | 53 | # input and output logits should be on same device 54 | device_map['model.decoder.lm.lm_head'] = device_1 55 | 56 | model = dispatch_model( 57 | model, 58 | device_map=device_map).eval() 59 | 60 | self.model = model 61 | kwargs_default = dict(max_new_tokens=512, length_penalty=-1) 62 | kwargs_default.update(kwargs) 63 | self.kwargs = kwargs_default 64 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 65 | 66 | def generate_inner(self, message, dataset=None): 67 | query, images = '', [] 68 | for item in message: 69 | if item['type'] == 'image': 70 | images.append(Image.open(item['value']).convert('RGB')) 71 | query += '[<IMG_PLH>]' 72 | elif item['type'] == 'text': 73 | query += item['value'] 74 | 75 | inputs = self.model.build_input_ids( 76 | text=[query], 77 | tokenizer=self.tokenizer, 78 | image=images 79 | ) 80 | 81 | with torch.no_grad(): 82 | outputs = self.model.generate( 83 | input_ids=inputs['input_ids'], 84 | attention_mask=inputs['attention_mask'], 85 | image=inputs['image'].to(torch.bfloat16), 86 | **self.kwargs) 87 | 88 | output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) 89 | return output_text[0] 90 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/instructblip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import os.path as osp 4 | import sys 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class InstructBLIP(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, name): 15 | self.config_map = { 16 | 'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml', 17 | 'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml', 18 | } 19 | 20 | self.file_path = __file__ 21 | config_root = osp.dirname(self.file_path) 22 | 23 | try: 24 | from lavis.models import load_preprocess 25 | from omegaconf import OmegaConf 26 | from lavis.common.registry import registry 27 | except: 28 | warnings.warn('Please install lavis before using InstructBLIP. ') 29 | sys.exit(-1) 30 | 31 | assert name in self.config_map 32 | cfg_path = osp.join(config_root, self.config_map[name]) 33 | cfg = OmegaConf.load(cfg_path) 34 | 35 | model_cfg = cfg.model 36 | assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2 37 | model_cls = registry.get_model_class(name='blip2_vicuna_instruct') 38 | model = model_cls.from_config(model_cfg) 39 | model.eval() 40 | 41 | self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu' 42 | device = self.device 43 | model.to(device) 44 | self.model = model 45 | self.kwargs = {'max_length': 512} 46 | 47 | preprocess_cfg = cfg.preprocess 48 | vis_processors, _ = load_preprocess(preprocess_cfg) 49 | self.vis_processors = vis_processors 50 | 51 | def generate_inner(self, message, dataset=None): 52 | prompt, image_path = self.message_to_promptimg(message) 53 | vis_processors = self.vis_processors 54 | raw_image = Image.open(image_path).convert('RGB') 55 | image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device) 56 | outputs = self.model.generate(dict(image=image_tensor, prompt=prompt)) 57 | return outputs[0] 58 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava import LLaVA, LLaVA_Next 2 | from .llava_xtuner import LLaVA_XTuner 3 | 4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner'] 5 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/minigpt4.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os.path as osp 4 | import warnings 5 | from transformers import StoppingCriteriaList 6 | from .base import 
BaseModel 7 | 8 | 9 | class MiniGPT4(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, 15 | mode='v2', 16 | root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/', 17 | temperature=1, 18 | max_out_len=512): 19 | 20 | if root is None: 21 | warnings.warn( 22 | 'Please set root to the directory of MiniGPT-4, which is cloned from here: ' 23 | 'https://github.com/Vision-CAIR/MiniGPT-4. ' 24 | ) 25 | 26 | if mode == 'v2': 27 | cfg = 'minigptv2_eval.yaml' 28 | elif mode == 'v1_7b': 29 | cfg = 'minigpt4_7b_eval.yaml' 30 | elif mode == 'v1_13b': 31 | cfg = 'minigpt4_13b_eval.yaml' 32 | else: 33 | raise NotImplementedError 34 | 35 | self.mode = mode 36 | self.temperature = temperature 37 | self.max_out_len = max_out_len 38 | self.root = root 39 | this_dir = osp.dirname(__file__) 40 | 41 | self.cfg = osp.join(this_dir, 'misc', cfg) 42 | sys.path.append(self.root) 43 | 44 | from omegaconf import OmegaConf 45 | from minigpt4.common.registry import registry 46 | from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2 47 | 48 | device = torch.cuda.current_device() 49 | self.device = device 50 | 51 | cfg_path = self.cfg 52 | cfg = OmegaConf.load(cfg_path) 53 | 54 | model_cfg = cfg.model 55 | model_cfg.device_8bit = device 56 | model_cls = registry.get_model_class(model_cfg.arch) 57 | model = model_cls.from_config(model_cfg) 58 | model = model.to(device) 59 | model.eval() 60 | vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train 61 | vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) 62 | self.model = model 63 | self.vis_processor = vis_processor 64 | 65 | self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0 66 | stop_words_ids = [[835], [2277, 29937]] 67 | stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids] 68 | self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) 69 | 70 | def generate_inner(self, message, dataset=None): 71 | from minigpt4.conversation.conversation import Chat 72 | prompt, image_path = self.message_to_promptimg(message) 73 | if self.mode == 'v2': 74 | chat = Chat(self.model, self.vis_processor, device=self.device) 75 | else: 76 | chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria) 77 | 78 | chat_state = self.CONV_VISION.copy() 79 | img_list = [] 80 | _ = chat.upload_img(image_path, chat_state, img_list) 81 | chat.encode_img(img_list) 82 | chat.ask(prompt, chat_state) 83 | with torch.inference_mode(): 84 | msg = chat.answer(conv=chat_state, img_list=img_list)[0] 85 | return msg 86 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/misc/minigpt4_13b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-13b-v0" 25 | 26 | datasets: 27 | cc_sbu_align: 28 | vis_processor: 29 | train: 30 | name: "blip2_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | 36 | run: 37 | task: image_text_pretrain 38 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/misc/minigpt4_7b_eval.yaml: 
-------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-7b-v0" 25 | 26 | 27 | datasets: 28 | cc_sbu_align: 29 | vis_processor: 30 | train: 31 | name: "blip2_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | 37 | run: 38 | task: image_text_pretrain 39 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/misc/minigptv2_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt_v2 3 | model_type: pretrain 4 | max_txt_len: 160 5 | end_sym: "</s>" 6 | low_resource: True 7 | prompt_template: '[INST] {} [/INST]' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | lora_r: 64 10 | lora_alpha: 16 11 | 12 | # vit encoder 13 | image_size: 448 14 | drop_path_rate: 0 15 | use_grad_checkpoint: False 16 | vit_precision: "fp16" 17 | freeze_vit: True 18 | 19 | # generation configs 20 | prompt: "" 21 | 22 | # LLM 23 | llama_model: "please set this value to the path of llama2-chat-7b" 24 | 25 | datasets: 26 | cc_sbu_align: 27 | vis_processor: 28 | train: 29 | name: "blip2_image_eval" 30 | image_size: 448 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | 35 | run: 36 | task: image_text_pretrain 37 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/mmalaya.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | from PIL import Image 5 | from .base import BaseModel 6 | 7 | 8 | class MMAlaya(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='DataCanvas/MMAlaya', **kwargs): 14 | assert model_path is not None 15 | self.model_path = model_path 16 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 17 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True).eval() 18 | # need initialize tokenizer 19 | model.initialize_tokenizer(self.tokenizer) 20 | self.model = model.cuda() 21 | 22 | self.kwargs = kwargs 23 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 24 | torch.cuda.empty_cache() 25 | 26 | def generate_inner(self, message, dataset=None): 27 | # read image 28 | prompt, image_path = self.message_to_promptimg(message) 29 | image = Image.open(image_path).convert('RGB') 30 | # tokenize prompt, and proprecess image 31 | input_ids, image_tensor, stopping_criteria = self.model.prepare_for_inference( 32 | prompt, 33 | self.tokenizer, 34 | image, 35 | return_tensors='pt') 36 | with torch.inference_mode(): 37 | output_ids = self.model.generate( 38 | inputs=input_ids.cuda(), 39 | images=image_tensor.cuda(), 40 | do_sample=False, 41 | max_new_tokens=512, 42 | num_beams=1, 43 | use_cache=True, 44 | stopping_criteria=[stopping_criteria], 45 | ) 46 | # truncate input_ids in generate_ids and then decode to text 47 | input_token_len = input_ids.shape[1] 48 | response = self.tokenizer.batch_decode( 49 | output_ids[:, input_token_len:].cpu(), 50 | skip_special_tokens=True, 51 | clean_up_tokenization_spaces=False 52 | )[0].strip() 53 | return response 54 | 55 | 56 | if __name__ == '__main__': 57 | model = MMAlaya() 58 | response = model.generate(['./assets/apple.jpg', '请详细描述一下这张图片。']) 59 | print(response) 60 | 61 | """ 62 | export PYTHONPATH=$PYTHONPATH:/tmp/VLMEvalKit 63 | CUDA_VISIBLE_DEVICES=0 python vlmeval/vlm/mmalaya.py 64 | """ 65 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/open_flamingo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from PIL import Image 4 | import os.path as osp 5 | import warnings 6 | from .base import BaseModel 7 | from ..smp import splitlen, get_cache_path 8 | from huggingface_hub import snapshot_download 9 | 10 | 11 | class OpenFlamingo(BaseModel): 12 | 13 | INSTALL_REQ = True 14 | INTERLEAVE = True 15 | 16 | def __init__(self, 17 | name, 18 | mpt_pth=None, 19 | ckpt_pth=None, 20 | **kwargs): 21 | 22 | if mpt_pth is None: 23 | warnings.warn( 24 | 'Please set `mpt_pth` to the directory of MPT-7B, which is cloned from here: ' 25 | 'https://huggingface.co/mosaicml/mpt-7b. ' 26 | ) 27 | sys.exit(-1) 28 | if ckpt_pth is None: 29 | warnings.warn( 30 | 'Please set `ckpt_pth` to the openflamingo ckpt, which is the `checkpoint.pt` file downloaded ' 31 | 'from: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b/tree/main. 
' 32 | ) 33 | sys.exit(-1) 34 | else: 35 | if osp.exists(ckpt_pth): 36 | if ckpt_pth.endswith('checkpoint.pt'): 37 | pass 38 | elif osp.isdir(ckpt_pth): 39 | ckpt_pth = osp.join(ckpt_pth, 'checkpoint.pt') 40 | if not osp.exists(ckpt_pth): 41 | sys.exit(-1) 42 | elif splitlen(ckpt_pth, '/') == 2: 43 | cache_path = get_cache_path(ckpt_pth) 44 | if cache_path is None: 45 | snapshot_download(ckpt_pth) 46 | cache_path = get_cache_path(ckpt_pth) 47 | if cache_path is None: 48 | sys.exit(-1) 49 | else: 50 | ckpt_pth = osp.join(cache_path, 'checkpoint.pt') 51 | 52 | self.name = name 53 | assert name in ['v2'] 54 | self.mpt_pth = mpt_pth 55 | try: 56 | from open_flamingo import create_model_and_transforms 57 | except: 58 | raise ImportError('Please first install open_flamingo to use OpenFlamingo') 59 | model, image_processor, tokenizer = create_model_and_transforms( 60 | clip_vision_encoder_path='ViT-L-14', 61 | clip_vision_encoder_pretrained='openai', 62 | lang_encoder_path=mpt_pth, 63 | tokenizer_path=mpt_pth, 64 | cross_attn_every_n_layers=4) 65 | ckpt = torch.load(ckpt_pth) 66 | model.load_state_dict(ckpt, strict=False) 67 | torch.cuda.empty_cache() 68 | self.model = model.eval().cuda() 69 | self.tokenizer = tokenizer 70 | self.tokenizer.padding_side = 'left' 71 | self.image_proc = image_processor 72 | 73 | kwargs_default = dict(max_new_tokens=512, num_beams=3) 74 | kwargs_default.update(kwargs) 75 | self.kwargs = kwargs_default 76 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 77 | 78 | def generate_inner(self, message, dataset=None): 79 | vision_x = [] 80 | prompt = '' 81 | for msg in message: 82 | if msg['type'] == 'image': 83 | img = Image.open(msg['value']) 84 | vision_x.append(self.image_proc(img).unsqueeze(0)) 85 | prompt += '<image>' 86 | elif msg['type'] == 'text': 87 | prompt += msg['value'] 88 | prompt += 'Answer: ' 89 | vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0] 90 | vision_x = vision_x.unsqueeze(1).unsqueeze(0) 91 | lang_x = self.tokenizer([prompt], return_tensors='pt') 92 | generated_text = self.model.generate( 93 | vision_x=vision_x.cuda(), 94 | lang_x=lang_x['input_ids'].cuda(), 95 | attention_mask=lang_x['attention_mask'].cuda(), 96 | **self.kwargs) 97 | generated_text = self.tokenizer.decode(generated_text[0]) 98 | text = generated_text[len(prompt):].split('<|endofchunk|>')[0] 99 | return text 100 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/paligemma.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class PaliGemma(BaseModel): 9 | INSTALL_REQ = False 10 | INTERLEAVE = False 11 | 12 | def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs): 13 | try: 14 | from transformers import AutoProcessor, PaliGemmaForConditionalGeneration 15 | except: 16 | warnings.warn('Please install the latest version transformers.') 17 | sys.exit(-1) 18 | model = PaliGemmaForConditionalGeneration.from_pretrained( 19 | model_path, 20 | torch_dtype=torch.bfloat16, 21 | device_map='cpu', 22 | revision='bfloat16', 23 | ).eval() 24 | self.model = model.cuda() 25 | self.processor = AutoProcessor.from_pretrained(model_path) 26 | self.kwargs = kwargs 27 | 28 | def generate_inner(self, message, dataset=None): 29 | prompt, image_path = self.message_to_promptimg(message) 30 | image = 
Image.open(image_path).convert('RGB') 31 | 32 | model_inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda') 33 | input_len = model_inputs['input_ids'].shape[-1] 34 | 35 | with torch.inference_mode(): 36 | generation = self.model.generate(**model_inputs, max_new_tokens=512, do_sample=False) 37 | generation = generation[0][input_len:] 38 | res = self.processor.decode(generation, skip_special_tokens=True) 39 | return res 40 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/pandagpt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import os.path as osp 4 | import warnings 5 | from .base import BaseModel 6 | 7 | 8 | class PandaGPT(BaseModel): 9 | 10 | INSTALL_REQ = True 11 | INTERLEAVE = False 12 | 13 | def __init__(self, name, root=None, **kwargs): 14 | if root is None: 15 | warnings.warn('Please set `root` to PandaGPT code directory, which is cloned from here: ') 16 | sys.exit(-1) 17 | 18 | assert name == 'PandaGPT_13B' 19 | self.name = name 20 | sys.path.append(osp.join(root, 'code')) 21 | try: 22 | from model.openllama import OpenLLAMAPEFTModel 23 | except: 24 | raise ImportError( 25 | 'Please first install PandaGPT and set the root path to use PandaGPT, ' 26 | 'which is cloned from here: https://github.com/yxuansu/PandaGPT. ' 27 | ) 28 | self.args = { 29 | 'model': 'openllama_peft', 30 | 'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'), 31 | 'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'), 32 | 'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'), 33 | 'stage': 2, 34 | 'max_tgt_len': 512, 35 | 'lora_r': 32, 36 | 'lora_alpha': 32, 37 | 'lora_dropout': 0.1, 38 | } 39 | model = OpenLLAMAPEFTModel(**self.args) 40 | delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) 41 | model.load_state_dict(delta_ckpt, strict=False) 42 | torch.cuda.empty_cache() 43 | self.model = model.eval().half().cuda() 44 | kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} 45 | kwargs_default.update(kwargs) 46 | self.kwargs = kwargs_default 47 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 48 | 49 | def generate_inner(self, message, dataset=None): 50 | prompt, image_path = self.message_to_promptimg(message) 51 | struct = { 52 | 'prompt': prompt, 53 | 'image_paths': [image_path], 54 | 'audio_paths': [], 55 | 'video_paths': [], 56 | 'thermal_paths': [], 57 | 'modality_embeds': [] 58 | } 59 | struct.update(self.kwargs) 60 | resp = self.model.generate(struct) 61 | return resp 62 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/phi3_vision.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class Phi3Vision(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs): 14 | try: 15 | from transformers import AutoProcessor, AutoModelForCausalLM 16 | except: 17 | warnings.warn('Please install the latest version transformers.') 18 | sys.exit(-1) 19 | model = AutoModelForCausalLM.from_pretrained( 20 | model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto').eval() 21 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 22 | self.model = model 23 | self.processor = processor 24 | self.kwargs = kwargs 25 | 26 | def generate_inner(self, message, dataset=None): 27 | prompt, image_path = self.message_to_promptimg(message) 28 | image = Image.open(image_path).convert('RGB') 29 | messages = [ 30 | {'role': 'user', 'content': f'<|image_1|>\n{prompt}'} 31 | ] 32 | prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 33 | inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda') 34 | 35 | generation_args = { 36 | 'max_new_tokens': 500, 37 | 'temperature': 0.0, 38 | 'do_sample': False, 39 | } 40 | generation_args.update(self.kwargs) 41 | 42 | generate_ids = self.model.generate( 43 | **inputs, 44 | eos_token_id=self.processor.tokenizer.eos_token_id, 45 | **generation_args 46 | ) 47 | generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] 48 | response = self.processor.batch_decode( 49 | generate_ids, 50 | skip_special_tokens=True, 51 | clean_up_tokenization_spaces=False 52 | )[0] 53 | return response 54 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/qh_360vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import os.path as osp 5 | from PIL import Image 6 | from .base import BaseModel 7 | from ..smp import * 8 | from ..utils import DATASET_TYPE 9 | 10 | 11 | class QH_360VL(BaseModel): 12 | 13 | INSTALL_REQ = False 14 | INTERLEAVE = False 15 | 16 | def __init__(self, model_path='qihoo360/360VL-70B', **kwargs): 17 | assert model_path is not None 18 | self.model_path = model_path 19 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 20 | self.model = AutoModelForCausalLM.from_pretrained(model_path, 21 | torch_dtype=torch.float16, 22 | low_cpu_mem_usage=True, 23 | device_map='auto', 24 | trust_remote_code=True).eval() 25 | vision_tower = self.model.get_vision_tower() 26 | vision_tower.load_model() 27 | vision_tower.to(device='cuda', dtype=torch.float16) 28 | self.image_processor = vision_tower.image_processor 29 | self.tokenizer.pad_token = 
self.tokenizer.eos_token 30 | self.kwargs = kwargs 31 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 32 | torch.cuda.empty_cache() 33 | 34 | def generate(self, message, dataset=None): 35 | 36 | prompt, image_path = self.message_to_promptimg(message) 37 | print(prompt) 38 | image = Image.open(image_path).convert('RGB') 39 | terminators = [ 40 | self.tokenizer.convert_tokens_to_ids('<|eot_id|>',) 41 | ] 42 | inputs = self.model.build_conversation_input_ids(self.tokenizer, 43 | query=prompt, 44 | image=image, 45 | image_processor=self.image_processor) 46 | input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True) 47 | images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True) 48 | 49 | output_ids = self.model.generate(input_ids=input_ids, 50 | images=images, 51 | do_sample=False, 52 | num_beams=1, 53 | max_new_tokens=512, 54 | eos_token_id=terminators, 55 | use_cache=True) 56 | 57 | input_token_len = input_ids.shape[1] 58 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 59 | response = outputs.strip() 60 | 61 | return response 62 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/qwen_vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import copy as cp 5 | from .base import BaseModel 6 | from ..smp import isimg, listinstr 7 | from ..utils import DATASET_TYPE 8 | 9 | 10 | class QwenVL(BaseModel): 11 | 12 | INSTALL_REQ = False 13 | INTERLEAVE = True 14 | 15 | def __init__(self, model_path='Qwen/Qwen-VL', **kwargs): 16 | assert model_path is not None 17 | self.model_path = model_path 18 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 19 | tokenizer.padding_side = 'left' 20 | tokenizer.pad_token_id = tokenizer.eod_id 21 | self.tokenizer = tokenizer 22 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval() 23 | default_kwargs = dict( 24 | do_sample=False, 25 | num_beams=1, 26 | max_new_tokens=512, 27 | min_new_tokens=1, 28 | num_return_sequences=1, 29 | use_cache=True, 30 | output_hidden_states=True, 31 | pad_token_id=tokenizer.eod_id, 32 | eos_token_id=tokenizer.eod_id) 33 | default_kwargs.update(kwargs) 34 | self.kwargs = default_kwargs 35 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 36 | torch.cuda.empty_cache() 37 | 38 | def adjust_kwargs(self, dataset): 39 | kwargs = cp.deepcopy(self.kwargs) 40 | if DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: 41 | kwargs['max_new_tokens'] = 32 42 | elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset: 43 | kwargs['max_new_tokens'] = 32 44 | elif DATASET_TYPE(dataset) == 'VQA': 45 | if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset): 46 | kwargs['max_new_tokens'] = 100 47 | elif listinstr(['TextVQA'], dataset): 48 | kwargs['max_new_tokens'] = 10 49 | return kwargs 50 | 51 | def generate_inner(self, message, dataset=None): 52 | if dataset is not None: 53 | kwargs = self.adjust_kwargs(dataset) 54 | else: 55 | kwargs = self.kwargs 56 | prompt = '' 57 | for s in message: 58 | if s['type'] == 'image': 59 | prompt += f'<img>{s["value"]}</img>' 60 | elif s['type'] == 'text': 61 | prompt += s['value'] 62 | if dataset is not None and DATASET_TYPE(dataset) == 'VQA': 63 | prompt += ' Answer:' 64 | encoded = self.tokenizer([prompt], return_tensors='pt', padding='longest') 65 | input_ids = encoded.input_ids.to('cuda') 66 | attention_mask = encoded.attention_mask.to('cuda') 67 | 68 | pred = self.model.generate( 69 | input_ids=input_ids, 70 | attention_mask=attention_mask, 71 | **kwargs) 72 | answer = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip() 73 | return answer 74 | 75 | 76 | class QwenVLChat(BaseModel): 77 | 78 | INSTALL_REQ = False 79 | INTERLEAVE = True 80 | 81 | def __init__(self, model_path='Qwen/Qwen-VL-Chat', **kwargs): 82 | assert model_path is not None 83 | self.model_path = model_path 84 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 85 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval() 86 | torch.cuda.empty_cache() 87 | self.kwargs = kwargs 88 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 89 | 90 | def generate_inner(self, message, dataset=None): 91 | vl_list = [{'image': s['value']} if s['type'] == 'image' else {'text': s['value']} for s in message] 92 | query = self.tokenizer.from_list_format(vl_list) 93 | response, _ = self.model.chat(self.tokenizer, query=query, history=None, **self.kwargs) 94 | return response 95 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/visualglm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .base import BaseModel 3 | from ..smp import * 4 | 5 | 6 | class VisualGLM(BaseModel): 7 | 8 | INSTALL_REQ = False 9 | INTERLEAVE = False 10 | 11 | def __init__(self, model_path='THUDM/visualglm-6b', **kwargs): 12 | try: 13 | import sat 14 | except: 15 | warnings.warn('Please install SwissArmyTransformer to use VisualGLM') 16 | assert model_path is not None 17 | self.model_path = model_path 18 | 19 | from transformers import AutoModel 20 | from transformers import AutoTokenizer 21 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 22 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda() 23 | self.model = model 24 | self.kwargs = kwargs 25 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 26 | 27 | def generate_inner(self, message, dataset=None): 28 | prompt, image_path = self.message_to_promptimg(message) 29 | output, _ = self.model.chat( 30 | image_path=image_path, 31 | tokenizer=self.tokenizer, 32 | query=prompt, 33 | history=[], 34 | **self.kwargs 35 | ) 36 | return output 37 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/wemm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import sys 4 | from ..smp import * 5 | from .base import BaseModel 6 | from ..utils import DATASET_TYPE 7 | from transformers import AutoModel, GenerationConfig 8 | 9 | 10 | class WeMM(BaseModel): 11 | def __init__(self, model_path='feipengma/WeMM', **kwargs): 12 | self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True) 13 | self.wemm.cuda() 14 | self.wemm.eval() 15 | torch.cuda.empty_cache() 16 | 17 | def use_custom_prompt(self, dataset): 18 | assert dataset is not None 19 | if DATASET_TYPE(dataset) == 'multi-choice': 20 | return True 21 | return False 22 | 23 | def build_prompt(self, line, dataset=None): 24 | assert self.use_custom_prompt(dataset) 25 | assert dataset is None or isinstance(dataset, str) 26 | tgt_path = self.dump_image(line, dataset) 27 | question = line['question'] 28 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 29 | if hint is not None: 30 | question = hint + '\n' + question 31 | options = { 32 | cand: line[cand] 33 | for cand in string.ascii_uppercase 34 | if cand in line and not pd.isna(line[cand]) 35 | } 36 | for key, item in options.items(): 37 | question += f'\n{key}. {item}' 38 | prompt = question 39 | 40 | if len(options): 41 | prompt += ( 42 | '\n请直接回答选项字母。' if cn_string(prompt) else 43 | "\nAnswer with the option's letter from the given choices directly." 44 | ) 45 | else: 46 | prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' 47 | 48 | message = [dict(type='text', value=prompt)] 49 | message.extend([dict(type='image', value=p) for p in tgt_path]) 50 | return message 51 | 52 | def generate_inner(self, message, dataset=None): 53 | prompt, image_path = self.message_to_promptimg(message) 54 | 55 | if dataset == 'HallusionBench': 56 | prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.' 
57 | 58 | gen_config = None 59 | if dataset == 'MMVet': 60 | gen_config = GenerationConfig( 61 | max_new_tokens=512, 62 | do_sample=True, 63 | temperature=0.7, 64 | num_beams=3, 65 | eos_token_id=self.wemm.tokenizer.eos_token_id, 66 | pad_token_id=self.wemm.tokenizer.pad_token_id 67 | if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id, 68 | ) 69 | pred = self.wemm.mm_generate(image_path, prompt, gen_config) 70 | 71 | return pred 72 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/xcomposer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sharecaptioner import ShareCaptioner 2 | from .xcomposer import XComposer 3 | from .xcomposer2 import XComposer2 4 | from .xcomposer2_4KHD import XComposer2_4KHD 5 | 6 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD'] 7 | -------------------------------------------------------------------------------- /VLMEvalKit-main/vlmeval/vlm/xcomposer/sharecaptioner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from ..base import BaseModel 4 | from ...smp import * 5 | from ...utils import DATASET_TYPE 6 | 7 | 8 | class ShareCaptioner(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='Lin-Chen/ShareCaptioner', **kwargs): 14 | assert model_path is not None 15 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 16 | self.model = AutoModelForCausalLM.from_pretrained( 17 | model_path, device_map='cuda', trust_remote_code=True).eval() 18 | self.model.tokenizer = tokenizer 19 | self.model.cuda() 20 | self.model.half() 21 | 22 | def use_custom_prompt(self, dataset): 23 | assert dataset is not None 24 | if DATASET_TYPE(dataset) == 'multi-choice': 25 | return True 26 | return False 27 | 28 | def build_prompt(self, line, dataset=None): 29 | assert dataset is None or isinstance(dataset, str) 30 | assert self.use_custom_prompt(dataset) 31 | tgt_path = self.dump_image(line, dataset) 32 | 33 | if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': 34 | question = line['question'] 35 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 36 | if hint is not None: 37 | question = hint + '\n' + question 38 | 39 | option_candidate = string.ascii_uppercase 40 | options = { 41 | cand: line[cand] 42 | for cand in option_candidate 43 | if cand in line and not pd.isna(line[cand]) 44 | } 45 | for key, item in options.items(): 46 | question += f'\n{key}. {item}' 47 | prompt = question 48 | 49 | if not cn_string(prompt): 50 | prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly." 
51 | else: 52 | prompt = prompt + '\n' + '请直接回答选项字母。' 53 | else: 54 | prompt = line['question'] 55 | message = [dict(type='text', value=prompt)] 56 | message.extend([dict(type='image', value=s) for s in tgt_path]) 57 | return message 58 | 59 | def generate_inner(self, message, dataset=None): 60 | prompt, image_path = self.message_to_promptimg(message) 61 | seg1 = '<|User|>:' 62 | seg2 = f'{prompt}{self.model.eoh}\n<|Bot|>:' 63 | self.seg_emb1 = self.model.encode_text(seg1, add_special_tokens=True) 64 | self.seg_emb2 = self.model.encode_text(seg2, add_special_tokens=False) 65 | 66 | image = Image.open(image_path).convert('RGB') 67 | image = self.model.vis_processor(image).unsqueeze(0) 68 | image = image.to(self.model.device) 69 | tmp_bs = image.shape[0] 70 | tmp_seg_emb1 = self.seg_emb1.repeat(tmp_bs, 1, 1) 71 | tmp_seg_emb2 = self.seg_emb2.repeat(tmp_bs, 1, 1) 72 | with torch.cuda.amp.autocast(): 73 | with torch.no_grad(): 74 | image = self.model.encode_img(image) 75 | input_emb = torch.cat( 76 | [tmp_seg_emb1, image, tmp_seg_emb2], dim=1) 77 | out_embeds = self.model.internlm_model.generate( 78 | inputs_embeds=input_emb, 79 | max_length=500, 80 | num_beams=3, 81 | min_length=1, 82 | do_sample=True, 83 | repetition_penalty=1.5, 84 | length_penalty=1.0, 85 | temperature=1., 86 | eos_token_id=self.model.tokenizer.eos_token_id, 87 | num_return_sequences=1) 88 | 89 | for j, out in enumerate(out_embeds): 90 | out[out == -1] = 2 91 | response = self.model.decode_text([out]) 92 | return response 93 | -------------------------------------------------------------------------------- /assets/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMT-Bench/84012c95e31c2986521ea5b7c16a88e36e9958c2/assets/apple.jpg -------------------------------------------------------------------------------- /assets/metatask_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMT-Bench/84012c95e31c2986521ea5b7c16a88e36e9958c2/assets/metatask_eval.png -------------------------------------------------------------------------------- /assets/overall_progress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMT-Bench/84012c95e31c2986521ea5b7c16a88e36e9958c2/assets/overall_progress.png -------------------------------------------------------------------------------- /assets/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/MMT-Bench/84012c95e31c2986521ea5b7c16a88e36e9958c2/assets/overview.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | gradio==4.15.0 3 | huggingface_hub 4 | matplotlib 5 | numpy>=1.23.4 6 | omegaconf 7 | openai==1.3.5 8 | opencv-python>=4.4.0.46 9 | openpyxl 10 | pandas>=1.5.3 11 | pillow 12 | portalocker 13 | protobuf 14 | pycocoevalcap 15 | python-dotenv 16 | requests 17 | rich 18 | seaborn 19 | sentencepiece 20 | sty 21 | tabulate 22 | tiktoken 23 | timeout-decorator 24 | torch>=2.0.1 25 | tqdm 26 | transformers 27 | typing_extensions==4.7.1 28 | validators 29 | visual_genome 30 | xlsxwriter 31 | xtuner 32 | -------------------------------------------------------------------------------- /vlmeval/__init__.py: 
-------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | except ImportError: 4 | pass 5 | 6 | from .smp import * 7 | from .api import * 8 | from .evaluate import * 9 | from .utils import * 10 | from .vlm import * 11 | from .config import * 12 | from .tools import cli 13 | 14 | load_env() 15 | -------------------------------------------------------------------------------- /vlmeval/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import OpenAIWrapper, GPT4V 2 | from .gpt_int import OpenAIWrapperInternal, GPT4V_Internal 3 | from .hf_chat_model import HFChatModel 4 | from .gemini import GeminiWrapper, GeminiProVision 5 | from .qwen_vl_api import QwenVLWrapper, QwenVLAPI 6 | from .qwen_api import QwenAPI 7 | from .stepai import Step1V_INT 8 | from .claude import Claude_Wrapper, Claude3V 9 | from .reka import Reka 10 | from .glm_vision import GLMVisionAPI 11 | from .cloudwalk import CWWrapper 12 | 13 | __all__ = [ 14 | 'OpenAIWrapper', 'HFChatModel', 'OpenAIWrapperInternal', 'GeminiWrapper', 15 | 'GPT4V', 'GPT4V_Internal', 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI', 16 | 'QwenAPI', 'Claude3V', 'Claude_Wrapper', 'Reka', 'Step1V_INT', 'GLMVisionAPI', 17 | 'CWWrapper' 18 | ] 19 | -------------------------------------------------------------------------------- /vlmeval/api/claude.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import base64 5 | import mimetypes 6 | 7 | url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat' 8 | headers = { 9 | 'alles-apin-token': '', 10 | 'Content-Type': 'application/json' 11 | } 12 | 13 | 14 | class Claude_Wrapper(BaseAPI): 15 | 16 | is_api: bool = True 17 | 18 | def __init__(self, 19 | model: str = 'claude-3-opus-20240229', 20 | key: str = None, 21 | retry: int = 10, 22 | wait: int = 3, 23 | system_prompt: str = None, 24 | verbose: bool = True, 25 | temperature: float = 0, 26 | max_tokens: int = 1024, 27 | **kwargs): 28 | 29 | self.model = model 30 | self.headers = headers 31 | self.temperature = temperature 32 | self.max_tokens = max_tokens 33 | if key is not None: 34 | self.key = key 35 | else: 36 | self.key = os.environ.get('ALLES', '') 37 | self.headers['alles-apin-token'] = self.key 38 | 39 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 40 | 41 | def build_msgs(self, msgs_raw): 42 | 43 | messages = [] 44 | message = {'role': 'user', 'content': []} 45 | for msg in msgs_raw: 46 | if msg['type'] == 'image': 47 | pth = msg['value'] 48 | suffix = osp.splitext(pth)[-1].lower() 49 | media_type = mimetypes.types_map.get(suffix, None) 50 | assert media_type is not None 51 | 52 | item = { 53 | 'type': 'image', 54 | 'source': {'type': 'base64', 'media_type': media_type, 'data': encode_image_file_to_base64(pth)} 55 | } 56 | 57 | elif msg['type'] == 'text': 58 | item = {'type': 'text', 'text': msg['value']} 59 | else: 60 | raise NotImplementedError(f'Unsupported message type: {msg["type"]}') 61 | 62 | message['content'].append(item) 63 | messages.append(message) 64 | return messages 65 | 66 | def generate_inner(self, inputs, **kwargs) -> str: 67 | 68 | payload = json.dumps({ 69 | 'model': self.model, 70 | 'max_tokens': self.max_tokens, 71 | 'messages': self.build_msgs(msgs_raw=inputs), 72 | **kwargs 73 | }) 74 | response = requests.request('POST', 
url, headers=headers, data=payload) 75 | 76 | ret_code = response.status_code 77 | retry = self.retry 78 | while ret_code == 429 and retry > 0: 79 | sleep(15) 80 | response = requests.request('POST', url, headers=headers, data=payload) 81 | ret_code = response.status_code 82 | retry -= 1 83 | 84 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 85 | answer = self.fail_msg 86 | 87 | try: 88 | resp_struct = json.loads(response.text) 89 | answer = resp_struct['data']['content'][0]['text'].strip() 90 | except: 91 | pass 92 | 93 | return ret_code, answer, response 94 | 95 | 96 | class Claude3V(Claude_Wrapper): 97 | 98 | def generate(self, message, dataset=None): 99 | return super(Claude_Wrapper, self).generate(message) 100 | -------------------------------------------------------------------------------- /vlmeval/api/cloudwalk.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | import os 3 | from .base import BaseAPI 4 | 5 | 6 | class CWWrapper(BaseAPI): 7 | 8 | is_api: bool = True 9 | 10 | def __init__(self, 11 | model: str = 'cw-congrong-v1.5', 12 | retry: int = 10, 13 | wait: int = 5, 14 | key: str = None, 15 | verbose: bool = True, 16 | system_prompt: str = None, 17 | temperature: float = 0, 18 | timeout: int = 600, 19 | api_base: str = 'http://cwapi-vlm01.cw_rb.azurebot.tk/v1/chat/completions', 20 | max_tokens: int = 1024, 21 | img_size: int = 512, 22 | img_detail: str = 'low', 23 | **kwargs): 24 | 25 | self.model = model 26 | self.cur_idx = 0 27 | self.fail_msg = 'Failed to obtain answer via API. ' 28 | self.max_tokens = max_tokens 29 | self.temperature = temperature 30 | 31 | base = os.environ.get('CW_API_BASE', None) 32 | self.api_base = base if base is not None else api_base 33 | 34 | env_key = os.environ.get('CW_API_KEY', None) 35 | self.key = env_key if env_key is not None else key 36 | assert self.key is not None, 'API key not provided. Please set CW_API_KEY environment variable or \ 37 | pass it to the constructor.' 38 | 39 | assert img_size > 0 or img_size == -1 40 | self.img_size = -1 # always send full size image 41 | assert img_detail in ['high', 'low'] 42 | self.img_detail = img_detail 43 | 44 | self.vision = True 45 | self.timeout = timeout 46 | 47 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | # inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
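# (illustrative example, not from the original source; the file path is hypothetical) a typical `inputs` value for prepare_inputs is:
# inputs = [dict(type='image', value='/path/to/apple.jpg'), dict(type='text', value='What fruit is shown in the image?')]
# prepare_inputs below turns such a list into a single OpenAI-style user message whose content mixes image_url and text items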
50 | # content can be a string or a list of image & text 51 | def prepare_inputs(self, inputs): 52 | input_msgs = [] 53 | if self.system_prompt is not None: 54 | input_msgs.append(dict(role='system', content=self.system_prompt)) 55 | has_images = np.sum([x['type'] == 'image' for x in inputs]) 56 | if has_images: 57 | content_list = [] 58 | for msg in inputs: 59 | if msg['type'] == 'text': 60 | content_list.append(dict(type='text', text=msg['value'])) 61 | elif msg['type'] == 'image': 62 | from PIL import Image 63 | img = Image.open(msg['value']) 64 | b64 = encode_image_to_base64(img, target_size=self.img_size) 65 | img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail) 66 | content_list.append(dict(type='image_url', image_url=img_struct)) 67 | input_msgs.append(dict(role='user', content=content_list)) 68 | else: 69 | assert all([x['type'] == 'text' for x in inputs]) 70 | text = '\n'.join([x['value'] for x in inputs]) 71 | input_msgs.append(dict(role='user', content=text)) 72 | return input_msgs 73 | 74 | def generate_inner(self, inputs, **kwargs) -> str: 75 | input_msgs = self.prepare_inputs(inputs) 76 | temperature = kwargs.pop('temperature', self.temperature) 77 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 78 | 79 | if 0 < max_tokens <= 100: 80 | self.logger.warning( 81 | 'Less than 100 tokens left, ' 82 | 'may exceed the context window with some additional meta symbols. ' 83 | ) 84 | if max_tokens <= 0: 85 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' 86 | 87 | headers = {'Content-Type': 'application/json', 'Authorization': f'{self.key}'} 88 | payload = dict( 89 | model=self.model, 90 | messages=input_msgs, 91 | max_tokens=max_tokens, 92 | n=1, 93 | temperature=temperature, 94 | **kwargs) 95 | response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 96 | ret_code = response.status_code 97 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 98 | answer = self.fail_msg 99 | try: 100 | resp_struct = json.loads(response.text) 101 | answer = resp_struct['choices'][0]['message']['content'].strip() 102 | except: 103 | pass 104 | return ret_code, answer, response 105 | -------------------------------------------------------------------------------- /vlmeval/api/glm_vision.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from vlmeval.utils.dataset import DATASET_TYPE 4 | from vlmeval.smp.vlm import encode_image_file_to_base64 5 | 6 | 7 | class GLMVisionWrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | 11 | def __init__(self, 12 | model: str, 13 | retry: int = 5, 14 | wait: int = 5, 15 | key: str = None, 16 | verbose: bool = True, 17 | system_prompt: str = None, 18 | max_tokens: int = 1024, 19 | proxy: str = None, 20 | **kwargs): 21 | 22 | self.model = model 23 | self.fail_msg = 'Failed to obtain answer via API. 
' 24 | self.default_params = { 25 | 'top_p': 0.6, 26 | 'top_k': 2, 27 | 'temperature': 0.8, 28 | 'repetition_penalty': 1.1, 29 | 'best_of': 1, 30 | 'do_sample': True, 31 | 'stream': False, 32 | 'max_tokens': max_tokens 33 | } 34 | if key is None: 35 | key = os.environ.get('GLMV_API_KEY', None) 36 | assert key is not None, ( 37 | 'Please set the API Key (obtain it here: ' 38 | 'https://open.bigmodel.cn/dev/howuse/introduction)' 39 | ) 40 | self.key = key 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | def image_to_base64(self, image_path): 44 | import base64 45 | with open(image_path, 'rb') as image_file: 46 | encoded_string = base64.b64encode(image_file.read()) 47 | return encoded_string.decode('utf-8') 48 | 49 | def build_msgs(self, msgs_raw, system_prompt=None, dataset=None): 50 | msgs = cp.deepcopy(msgs_raw) 51 | content = [] 52 | text = '' 53 | for i, msg in enumerate(msgs): 54 | if msg['type'] == 'text': 55 | text += msg['value'] 56 | elif msg['type'] == 'image': 57 | content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value'])))) 58 | if dataset is not None and DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: 59 | text += '\nShort Answer.' 60 | content.append(dict(type='text', text=text)) 61 | ret = [dict(role='user', content=content)] 62 | return ret 63 | 64 | def generate_inner(self, inputs, **kwargs) -> str: 65 | assert isinstance(inputs, str) or isinstance(inputs, list) 66 | inputs = [inputs] if isinstance(inputs, str) else inputs 67 | 68 | messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None)) 69 | 70 | url = 'https://api.chatglm.cn/v1/chat/completions' 71 | headers = { 72 | 'Content-Type': 'application/json', 73 | 'Request-Id': 'remote-test', 74 | 'Authorization': f'Bearer {self.key}' 75 | } 76 | payload = { 77 | 'model': self.model, 78 | 'messages': messages, 79 | **self.default_params 80 | } 81 | response = requests.post(url, headers=headers, data=json.dumps(payload), verify=False) 82 | output = [] 83 | try: 84 | assert response.status_code == 200 85 | for line in response.iter_lines(): 86 | data = json.loads(line.decode('utf-8').lstrip('data: ')) 87 | output.append(data['choices'][0]['message']['content']) 88 | answer = ''.join(output).replace('', '') 89 | if self.verbose: 90 | self.logger.info(f'inputs: {inputs}\nanswer: {answer}') 91 | return 0, answer, 'Succeeded! 
' 92 | except Exception as err: 93 | if self.verbose: 94 | self.logger.error(err) 95 | self.logger.error(f'The input messages are {inputs}.') 96 | return -1, self.fail_msg, '' 97 | 98 | 99 | class GLMVisionAPI(GLMVisionWrapper): 100 | 101 | def generate(self, message, dataset=None): 102 | return super(GLMVisionAPI, self).generate(message, dataset=dataset) 103 | -------------------------------------------------------------------------------- /vlmeval/api/gpt_int.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | import requests 4 | from ..smp import * 5 | from .gpt import GPT_context_window, OpenAIWrapper 6 | 7 | url = 'http://ecs.sv.us.alles-apin.openxlab.org.cn/v1/openai/v2/text/chat' 8 | headers = { 9 | 'Content-Type': 'application/json' 10 | } 11 | 12 | 13 | class OpenAIWrapperInternal(OpenAIWrapper): 14 | 15 | is_api: bool = True 16 | 17 | def __init__(self, 18 | model: str = 'gpt-3.5-turbo-0125', 19 | retry: int = 5, 20 | wait: int = 3, 21 | verbose: bool = True, 22 | system_prompt: str = None, 23 | temperature: float = 0, 24 | timeout: int = 60, 25 | max_tokens: int = 1024, 26 | img_size: int = 512, 27 | img_detail: str = 'low', 28 | **kwargs): 29 | 30 | self.model = model 31 | if 'KEYS' in os.environ and osp.exists(os.environ['KEYS']): 32 | keys = load(os.environ['KEYS']) 33 | headers['alles-apin-token'] = keys.get('alles-apin-token', '') 34 | elif 'ALLES' in os.environ: 35 | headers['alles-apin-token'] = os.environ['ALLES'] 36 | self.headers = headers 37 | self.temperature = temperature 38 | self.timeout = timeout 39 | self.max_tokens = max_tokens 40 | 41 | assert img_size > 0 or img_size == -1 42 | self.img_size = img_size 43 | assert img_detail in ['high', 'low'] 44 | self.img_detail = img_detail 45 | 46 | super(OpenAIWrapper, self).__init__( 47 | wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 48 | 49 | def generate_inner(self, inputs, **kwargs) -> str: 50 | input_msgs = self.prepare_inputs(inputs) 51 | 52 | temperature = kwargs.pop('temperature', self.temperature) 53 | max_tokens = kwargs.pop('max_tokens', self.max_tokens) 54 | 55 | # Held out 100 tokens as buffer 56 | context_window = GPT_context_window(self.model) 57 | max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) 58 | if 0 < max_tokens <= 100: 59 | print('Less than 100 tokens left, may exceed the context window with some additional meta symbols. ') 60 | if max_tokens <= 0: 61 | return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. 
' 62 | 63 | payload = dict( 64 | model=self.model, 65 | messages=input_msgs, 66 | max_tokens=max_tokens, 67 | n=1, 68 | stop=None, 69 | timeout=self.timeout, 70 | temperature=temperature, 71 | **kwargs) 72 | 73 | response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) 74 | ret_code = response.status_code 75 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 76 | 77 | answer = self.fail_msg 78 | try: 79 | resp_struct = json.loads(response.text) 80 | assert resp_struct['msg'] == 'ok' and resp_struct['msgCode'] == '10000', resp_struct 81 | answer = resp_struct['data']['choices'][0]['message']['content'].strip() 82 | except: 83 | pass 84 | return ret_code, answer, response 85 | 86 | 87 | class GPT4V_Internal(OpenAIWrapperInternal): 88 | 89 | def generate(self, message, dataset=None): 90 | return super(GPT4V_Internal, self).generate(message) 91 | -------------------------------------------------------------------------------- /vlmeval/api/qwen_api.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | import os 3 | from vlmeval.api.base import BaseAPI 4 | from vlmeval.smp import * 5 | 6 | 7 | # Note: This is a pure language model API. 8 | class QwenAPI(BaseAPI): 9 | 10 | is_api: bool = True 11 | 12 | def __init__(self, 13 | model: str = 'qwen-max-1201', 14 | retry: int = 5, 15 | wait: int = 5, 16 | verbose: bool = True, 17 | seed: int = 2680, 18 | temperature: float = 0.0, 19 | system_prompt: str = None, 20 | key: str = None, 21 | max_tokens: int = 1024, 22 | proxy: str = None, 23 | **kwargs): 24 | 25 | assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext'] 26 | self.model = model 27 | import dashscope 28 | self.fail_msg = 'Failed to obtain answer via API. ' 29 | self.max_tokens = max_tokens 30 | self.temperature = temperature 31 | self.seed = seed 32 | if key is None: 33 | key = os.environ.get('DASHSCOPE_API_KEY', None) 34 | assert key is not None, ( 35 | 'Please set the API Key (obtain it here: ' 36 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 37 | ) 38 | dashscope.api_key = key 39 | if proxy is not None: 40 | proxy_set(proxy) 41 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 42 | 43 | @staticmethod 44 | def build_msgs(msgs_raw, system_prompt=None): 45 | msgs = cp.deepcopy(msgs_raw) 46 | ret = [] 47 | if system_prompt is not None: 48 | ret.append(dict(role='system', content=system_prompt)) 49 | for i, msg in enumerate(msgs): 50 | role = 'user' if i % 2 == 0 else 'assistant' 51 | ret.append(dict(role=role, content=msg)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | inputs = [inputs] if isinstance(inputs, str) else inputs 58 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 59 | 60 | import dashscope 61 | response = dashscope.Generation.call( 62 | model=self.model, 63 | messages=messages, 64 | seed=self.seed, 65 | temperature=self.temperature, 66 | max_tokens=self.max_tokens, 67 | result_format='message', # set the result to be "message" format. 68 | ) 69 | if response.status_code != HTTPStatus.OK: 70 | return -1, 'Error: Bad Response Status Code. ', f'The response status code is {response.status_code}.
' 71 | 72 | try: 73 | return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! ' 74 | except Exception as err: 75 | return -1, f'Error: Failed to parse the response. {err}', response 76 | -------------------------------------------------------------------------------- /vlmeval/api/qwen_vl_api.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | 5 | class QwenVLWrapper(BaseAPI): 6 | 7 | is_api: bool = True 8 | 9 | def __init__(self, 10 | model: str = 'qwen-vl-plus', 11 | retry: int = 5, 12 | wait: int = 5, 13 | key: str = None, 14 | verbose: bool = True, 15 | temperature: float = 0.0, 16 | system_prompt: str = None, 17 | max_tokens: int = 1024, 18 | proxy: str = None, 19 | **kwargs): 20 | 21 | assert model in ['qwen-vl-plus', 'qwen-vl-max'] 22 | self.model = model 23 | import dashscope 24 | self.fail_msg = 'Failed to obtain answer via API. ' 25 | self.max_tokens = max_tokens 26 | self.temperature = temperature 27 | if key is None: 28 | key = os.environ.get('DASHSCOPE_API_KEY', None) 29 | assert key is not None, ( 30 | 'Please set the API Key (obtain it here: ' 31 | 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' 32 | ) 33 | dashscope.api_key = key 34 | if proxy is not None: 35 | proxy_set(proxy) 36 | super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) 37 | 38 | @staticmethod 39 | def build_msgs(msgs_raw, system_prompt=None): 40 | msgs = cp.deepcopy(msgs_raw) 41 | ret = [] 42 | if system_prompt is not None: 43 | content = [dict(text=system_prompt)]  # wrap the dict in a list; list(dict(...)) would only keep the keys 44 | ret.append(dict(role='system', content=content)) 45 | content = [] 46 | for msg in msgs: 47 | if msg['type'] == 'text': 48 | content.append(dict(text=msg['value'])) 49 | elif msg['type'] == 'image': 50 | content.append(dict(image='file://' + msg['value'])) 51 | ret.append(dict(role='user', content=content)) 52 | return ret 53 | 54 | def generate_inner(self, inputs, **kwargs) -> str: 55 | from dashscope import MultiModalConversation 56 | assert isinstance(inputs, str) or isinstance(inputs, list) 57 | pure_text = np.all([x['type'] == 'text' for x in inputs]) 58 | assert not pure_text 59 | messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) 60 | gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature) 61 | gen_config.update(kwargs) 62 | try: 63 | response = MultiModalConversation.call(model=self.model, messages=messages) 64 | if self.verbose: 65 | print(response) 66 | answer = response.output.choices[0]['message']['content'][0]['text'] 67 | return 0, answer, 'Succeeded!
' 68 | except Exception as err: 69 | if self.verbose: 70 | self.logger.error(err) 71 | self.logger.error(f'The input messages are {inputs}.') 72 | 73 | return -1, '', '' 74 | 75 | 76 | class QwenVLAPI(QwenVLWrapper): 77 | 78 | def generate(self, message, dataset=None): 79 | return super(QwenVLAPI, self).generate(message) 80 | -------------------------------------------------------------------------------- /vlmeval/api/reka.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import mimetypes 5 | 6 | 7 | class Reka_Wrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | INTERLEAVE: bool = False 11 | 12 | def __init__(self, 13 | model: str = 'reka-flash-20240226', 14 | key: str = None, 15 | retry: int = 10, 16 | wait: int = 3, 17 | system_prompt: str = None, 18 | verbose: bool = True, 19 | temperature: float = 0, 20 | max_tokens: int = 1024, 21 | **kwargs): 22 | 23 | try: 24 | import reka 25 | except ImportError: 26 | raise ImportError('Please install reka by running "pip install reka-api"') 27 | 28 | self.model = model 29 | default_kwargs = dict(temperature=temperature, request_output_len=max_tokens) 30 | default_kwargs.update(kwargs) 31 | self.kwargs = default_kwargs 32 | if key is not None: 33 | self.key = key 34 | else: 35 | self.key = os.environ.get('REKA_API_KEY', '') 36 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 37 | 38 | def generate_inner(self, inputs, **kwargs) -> str: 39 | import reka 40 | reka.API_KEY = self.key 41 | prompt, image_path = self.message_to_promptimg(inputs) 42 | image_b64 = encode_image_file_to_base64(image_path) 43 | 44 | response = reka.chat( 45 | model_name=self.model, 46 | human=prompt, 47 | media_url=f'data:image/jpeg;base64,{image_b64}', 48 | **self.kwargs) 49 | 50 | try: 51 | return 0, response['text'], response 52 | except: 53 | return -1, self.fail_msg, response 54 | 55 | 56 | class Reka(Reka_Wrapper): 57 | 58 | def generate(self, message, dataset=None): 59 | return super(Reka_Wrapper, self).generate(message) 60 | -------------------------------------------------------------------------------- /vlmeval/api/stepai.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | 4 | url = 'https://api.stepfun.com/v1/chat/completions' 5 | headers = { 6 | 'Content-Type': 'application/json', 7 | 'Authorization': 'Bearer {}', 8 | } 9 | 10 | 11 | class StepAPI_INT(BaseAPI): 12 | 13 | is_api: bool = True 14 | 15 | def __init__(self, 16 | model: str = 'step-1v-8k', 17 | retry: int = 10, 18 | wait: int = 3, 19 | key: str = None, 20 | temperature: float = 0, 21 | max_tokens: int = 300, 22 | verbose: bool = True, 23 | system_prompt: str = None, 24 | **kwargs): 25 | self.model = model 26 | self.fail_msg = 'Fail to obtain answer via API.' 
27 | self.headers = headers 28 | self.temperature = temperature 29 | self.max_tokens = max_tokens 30 | self.system_prompt = system_prompt 31 | if key is not None: 32 | self.key = key 33 | else: 34 | self.key = os.environ.get('STEPAI_API_KEY', '') 35 | headers['Authorization'] = headers['Authorization'].format(self.key) 36 | 37 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 38 | 39 | @staticmethod 40 | def build_msgs(msgs_raw): 41 | messages = [] 42 | message = {'role': 'user', 'content': []} 43 | 44 | for msg in msgs_raw: 45 | if msg['type'] == 'image': 46 | image_b64 = encode_image_file_to_base64(msg['value']) 47 | message['content'].append({ 48 | 'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)}, 49 | 'type': 'image_url' 50 | }) 51 | elif msg['type'] == 'text': 52 | message['content'].append({ 53 | 'text': msg['value'], 54 | 'type': 'text' 55 | }) 56 | 57 | messages.append(message) 58 | return messages 59 | 60 | def generate_inner(self, inputs, **kwargs) -> str: 61 | print(inputs, '\n') 62 | payload = dict( 63 | model=self.model, 64 | max_tokens=self.max_tokens, 65 | temperature=self.temperature, 66 | messages=self.build_msgs(msgs_raw=inputs), 67 | **kwargs) 68 | response = requests.post(url, headers=headers, data=json.dumps(payload)) 69 | ret_code = response.status_code 70 | ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code 71 | 72 | answer = self.fail_msg 73 | try: 74 | resp_struct = json.loads(response.text) 75 | answer = resp_struct['choices'][0]['message']['content'].strip() 76 | except: 77 | pass 78 | return ret_code, answer, response 79 | 80 | 81 | class Step1V_INT(StepAPI_INT): 82 | 83 | def generate(self, message, dataset=None): 84 | return super(StepAPI_INT, self).generate(message) 85 | -------------------------------------------------------------------------------- /vlmeval/evaluate/OCRBench.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | 3 | 4 | def OCRBench_eval(eval_file): 5 | OCRBench_score = { 6 | 'Regular Text Recognition': 0, 7 | 'Irregular Text Recognition': 0, 8 | 'Artistic Text Recognition': 0, 9 | 'Handwriting Recognition': 0, 10 | 'Digit String Recognition': 0, 11 | 'Non-Semantic Text Recognition': 0, 12 | 'Scene Text-centric VQA': 0, 13 | 'Doc-oriented VQA': 0, 14 | 'Key Information Extraction': 0, 15 | 'Handwritten Mathematical Expression Recognition': 0 16 | } 17 | 18 | logger = get_logger('Evaluation') 19 | 20 | data = load(eval_file) 21 | lt = len(data) 22 | lines = [data.iloc[i] for i in range(lt)] 23 | for i in tqdm(range(len(lines))): 24 | line = lines[i] 25 | predict = str(line['prediction']) 26 | answers = eval(line['answer']) 27 | category = line['category'] 28 | if category == 'Handwritten Mathematical Expression Recognition': 29 | for j in range(len(answers)): 30 | answer = answers[j].strip().replace('\n', ' ').replace(' ', '') 31 | predict = predict.strip().replace('\n', ' ').replace(' ', '') 32 | if answer in predict: 33 | OCRBench_score[category] += 1 34 | break 35 | else: 36 | for j in range(len(answers)): 37 | answer = answers[j].lower().strip().replace('\n', ' ') 38 | predict = predict.lower().strip().replace('\n', ' ') 39 | if answer in predict: 40 | OCRBench_score[category] += 1 41 | break 42 | 43 | final_score_dict = {} 44 | final_score_dict['Text Recognition'] = ( 45 | OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] 46 | + OCRBench_score['Artistic Text 
Recognition'] + OCRBench_score['Handwriting Recognition'] 47 | + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'] 48 | ) 49 | final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] 50 | final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] 51 | final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] 52 | final_score_dict['Handwritten Mathematical Expression Recognition'] = \ 53 | OCRBench_score['Handwritten Mathematical Expression Recognition'] 54 | final_score_dict['Final Score'] = ( 55 | final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] 56 | + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] 57 | + final_score_dict['Handwritten Mathematical Expression Recognition'] 58 | ) 59 | final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score']) / 10 60 | score_pth = eval_file.replace('.xlsx', '_score.json') 61 | dump(final_score_dict, score_pth) 62 | logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 63 | logger.info('Score: ') 64 | for key, value in final_score_dict.items(): 65 | logger.info('{}:{}'.format(key, value)) 66 | -------------------------------------------------------------------------------- /vlmeval/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .yes_or_no import default_rating, MME_rating, YOrN_eval 2 | from .mmvet_eval import MMVet_eval 3 | from .multiple_choice import multiple_choice_eval 4 | from .coco_eval import COCO_eval 5 | from .vqa_eval import VQAEval 6 | from .mathvista_eval import MathVista_eval 7 | from .llavabench import LLaVABench_eval 8 | from .misc import build_judge 9 | from .OCRBench import OCRBench_eval 10 | -------------------------------------------------------------------------------- /vlmeval/evaluate/coco_eval.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from pycocoevalcap.bleu.bleu import Bleu 3 | from pycocoevalcap.rouge.rouge import Rouge 4 | from pycocoevalcap.cider.cider import Cider 5 | 6 | 7 | class COCO_Caption_Scorer(): 8 | def __init__(self, ref, gt): 9 | self.ref = ref 10 | self.gt = gt 11 | print('setting up scorers...') 12 | self.scorers = [ 13 | (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']), 14 | # (Meteor(), "METEOR"), # need java version 11.0.16+ 15 | (Rouge(), 'ROUGE_L'), 16 | (Cider(), 'CIDEr'), 17 | # (Spice(), "SPICE"), # need java version 11.0.16+ 18 | ] 19 | 20 | def compute_scores(self): 21 | total_scores = {} 22 | for scorer, method in self.scorers: 23 | print('computing %s score...' 
% (scorer.method())) 24 | score, scores = scorer.compute_score(self.gt, self.ref) 25 | if type(method) == list: 26 | for sc, scs, m in zip(score, scores, method): 27 | print('%s: %0.3f' % (m, sc * 100)) 28 | total_scores['Bleu'] = [x * 100 for x in score] 29 | else: 30 | print('%s: %0.3f' % (method, score * 100)) 31 | total_scores[method] = score * 100 32 | 33 | print('*****DONE*****') 34 | for key, value in total_scores.items(): 35 | print('{}:{}'.format(key, value)) 36 | return total_scores 37 | 38 | 39 | def COCO_eval(eval_file, nproc=4, verbose=False): 40 | logger = get_logger('Evaluation') 41 | 42 | data = load(eval_file) 43 | 44 | lt = len(data) 45 | lines = [data.iloc[i] for i in range(lt)] 46 | ref = {} 47 | gt = {} 48 | for i, line in enumerate(lines): 49 | ref[str(i)] = [str(line['prediction'])] 50 | gt[str(i)] = eval(line['answer']) 51 | 52 | scorer = COCO_Caption_Scorer(ref, gt) 53 | coco_caption_score_dict = scorer.compute_scores() 54 | 55 | score_pth = eval_file.replace('.xlsx', '_score.json') 56 | dump(coco_caption_score_dict, score_pth) 57 | logger.info(f'COCO_eval successfully finished evaluating {eval_file}, results saved in {score_pth}') 58 | logger.info('Score: ') 59 | for key, value in coco_caption_score_dict.items(): 60 | logger.info('{}:{}'.format(key, value)) 61 | 62 | 63 | def parse_args(): 64 | parser = argparse.ArgumentParser(description='Inference LLM Answers. ') 65 | parser.add_argument('--data', type=str, help='The question set for inference, in excel / tsv / json format. ') 66 | parser.add_argument('--nproc', type=int, default=4) 67 | parser.add_argument('--verbose', action='store_true') 68 | args = parser.parse_args() 69 | return args 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | COCO_eval(eval_file=args.data, nproc=args.nproc, verbose=args.verbose) 75 | -------------------------------------------------------------------------------- /vlmeval/evaluate/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | from vlmeval.api import OpenAIWrapper, OpenAIWrapperInternal 3 | from vlmeval.smp import load_env 4 | 5 | INTERNAL = os.environ.get('INTERNAL', 0) 6 | 7 | 8 | def build_judge(**kwargs): 9 | model = kwargs.pop('model', None) 10 | load_env() 11 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None) 12 | if LOCAL_LLM is None: 13 | model_map = { 14 | 'gpt-4-turbo': 'gpt-4-1106-preview', 15 | 'gpt-4-0613': 'gpt-4-0613', 16 | 'gpt-4-0125': 'gpt-4-0125-preview', 17 | 'gpt-4-0409': 'gpt-4-turbo-2024-04-09', 18 | 'chatgpt-1106': 'gpt-3.5-turbo-1106', 19 | 'chatgpt-0125': 'gpt-3.5-turbo-0125', 20 | } 21 | model_version = model_map[model] 22 | else: 23 | model_version = LOCAL_LLM 24 | if INTERNAL: 25 | model = OpenAIWrapperInternal(model_version, **kwargs) 26 | else: 27 | model = OpenAIWrapper(model_version, **kwargs) 28 | return model 29 | -------------------------------------------------------------------------------- /vlmeval/smp/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import * 2 | from .vlm import * 3 | from .misc import * 4 | from .log import * 5 | -------------------------------------------------------------------------------- /vlmeval/smp/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_initialized = {} 4 | 5 | 6 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): 7 | logger = logging.getLogger(name) 8 | if name in 
logger_initialized: 9 | return logger 10 | 11 | for logger_name in logger_initialized: 12 | if name.startswith(logger_name): 13 | return logger 14 | 15 | stream_handler = logging.StreamHandler() 16 | handlers = [stream_handler] 17 | 18 | try: 19 | import torch.distributed as dist 20 | if dist.is_available() and dist.is_initialized(): 21 | rank = dist.get_rank() 22 | else: 23 | rank = 0 24 | except ImportError: 25 | rank = 0 26 | 27 | if rank == 0 and log_file is not None: 28 | file_handler = logging.FileHandler(log_file, file_mode) 29 | handlers.append(file_handler) 30 | 31 | formatter = logging.Formatter( 32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 33 | for handler in handlers: 34 | handler.setFormatter(formatter) 35 | handler.setLevel(log_level) 36 | logger.addHandler(handler) 37 | 38 | if rank == 0: 39 | logger.setLevel(log_level) 40 | else: 41 | logger.setLevel(logging.ERROR) 42 | 43 | logger_initialized[name] = True 44 | return logger 45 | -------------------------------------------------------------------------------- /vlmeval/smp/vlm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import pandas as pd 4 | import numpy as np 5 | import string 6 | from uuid import uuid4 7 | import os.path as osp 8 | import base64 9 | from PIL import Image 10 | from .file import load, dump 11 | Image.MAX_IMAGE_PIXELS = 1e9 12 | 13 | 14 | def mmqa_display(question, target_size=512): 15 | question = {k.lower(): v for k, v in question.items()} 16 | keys = list(question.keys()) 17 | keys = [k for k in keys if k not in ['index', 'image']] 18 | 19 | images = question['image'] 20 | if isinstance(images, str): 21 | images = [images] 22 | 23 | idx = question.pop('index', 'XXX') 24 | print(f'INDEX: {idx}') 25 | 26 | for im in images: 27 | image = decode_base64_to_image(im, target_size=target_size) 28 | display(image) # noqa: F821 29 | 30 | for k in keys: 31 | try: 32 | if not pd.isna(question[k]): 33 | print(f'{k.upper()}. {question[k]}') 34 | except ValueError: 35 | if False in pd.isna(question[k]): 36 | print(f'{k.upper()}. 
{question[k]}') 37 | 38 | 39 | def encode_image_to_base64(img, target_size=-1): 40 | # if target_size == -1, will not do resizing 41 | # else, will set the max_size ot (target_size, target_size) 42 | if img.mode in ('RGBA', 'P'): 43 | img = img.convert('RGB') 44 | tmp = osp.join('/tmp', str(uuid4()) + '.jpg') 45 | if target_size > 0: 46 | img.thumbnail((target_size, target_size)) 47 | img.save(tmp) 48 | with open(tmp, 'rb') as image_file: 49 | image_data = image_file.read() 50 | ret = base64.b64encode(image_data).decode('utf-8') 51 | os.remove(tmp) 52 | return ret 53 | 54 | 55 | def encode_image_file_to_base64(image_path, target_size=-1): 56 | image = Image.open(image_path) 57 | return encode_image_to_base64(image, target_size=target_size) 58 | 59 | 60 | def decode_base64_to_image(base64_string, target_size=-1): 61 | image_data = base64.b64decode(base64_string) 62 | image = Image.open(io.BytesIO(image_data)) 63 | if image.mode in ('RGBA', 'P'): 64 | image = image.convert('RGB') 65 | if target_size > 0: 66 | image.thumbnail((target_size, target_size)) 67 | return image 68 | 69 | 70 | def decode_base64_to_image_file(base64_string, image_path, target_size=-1): 71 | image = decode_base64_to_image(base64_string, target_size=target_size) 72 | image.save(image_path) 73 | 74 | 75 | def build_option_str(option_dict): 76 | s = 'There are several options: \n' 77 | for c, content in option_dict.items(): 78 | if not pd.isna(content): 79 | s += f'{c}. {content}\n' 80 | return s 81 | 82 | 83 | def isimg(s): 84 | return osp.exists(s) or s.startswith('http') 85 | 86 | 87 | def read_ok(img_path): 88 | if not osp.exists(img_path): 89 | return False 90 | try: 91 | im = Image.open(img_path) 92 | assert im.size[0] > 0 and im.size[1] > 0 93 | return True 94 | except: 95 | return False 96 | 97 | 98 | def gpt_key_set(): 99 | openai_key = os.environ.get('OPENAI_API_KEY', None) 100 | return isinstance(openai_key, str) and openai_key.startswith('sk-') 101 | 102 | 103 | def apiok(wrapper): 104 | s = wrapper.generate('Hello!') 105 | return wrapper.fail_msg not in s 106 | 107 | 108 | def circular_pred(df, extract_func=None): 109 | if extract_func is None: 110 | extract_func = lambda x: x # noqa: E731 111 | df = df.sort_values('index') 112 | from vlmeval.utils import can_infer_option 113 | shift = int(1e6) 114 | 115 | choices = [extract_func(x) for x in df['prediction']] 116 | pred_map = {i: c for i, c in zip(df['index'], choices)} 117 | flag_map = {i: True for i in pred_map if i < 1e6} 118 | valid_map = {i: True for i in pred_map if i < 1e6} 119 | for i in df['index']: 120 | if i >= shift and pred_map[i] and pred_map[i - shift]: 121 | if ( 122 | pred_map[i] not in list(string.ascii_uppercase) or # noqa: W504 123 | pred_map[i - shift] not in list(string.ascii_uppercase) 124 | ): 125 | 126 | valid_map[i % shift] = False 127 | continue 128 | if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1: 129 | continue 130 | else: 131 | flag_map[i % shift] = False 132 | flag_map = {k: v for k, v in flag_map.items() if valid_map[k]} 133 | flags = list(flag_map.values()) 134 | return np.mean(flags) 135 | -------------------------------------------------------------------------------- /vlmeval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .matching_util import can_infer, can_infer_option, can_infer_text 2 | from .mp_util import track_progress_rich 3 | from .custom_prompt import CustomPrompt 4 | from .dataset_config import dataset_URLs, img_root_map, DATASET_TYPE, 
abbr2full 5 | from .dataset import TSVDataset, split_MMMU 6 | from .result_transfer import MMMU_result_transfer, MMTBench_result_transfer 7 | 8 | 9 | __all__ = [ 10 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 11 | 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt', 12 | 'split_MMMU', 'abbr2full', 'MMMU_result_transfer', 'MMTBench_result_transfer' 13 | ] 14 | -------------------------------------------------------------------------------- /vlmeval/utils/custom_prompt.py: -------------------------------------------------------------------------------- 1 | from ..smp import * 2 | from .dataset_config import img_root_map 3 | from abc import abstractmethod 4 | 5 | 6 | class CustomPrompt: 7 | 8 | @abstractmethod 9 | def use_custom_prompt(self, dataset): 10 | raise NotImplementedError 11 | 12 | @abstractmethod 13 | def build_prompt(self, line, dataset): 14 | raise NotImplementedError 15 | 16 | def dump_image(self, line, dataset): 17 | ROOT = LMUDataRoot() 18 | assert isinstance(dataset, str) 19 | img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) 20 | os.makedirs(img_root, exist_ok=True) 21 | 22 | if 'image' in line: 23 | if isinstance(line['image'], list): 24 | tgt_path = [] 25 | assert 'image_path' in line 26 | for img, im_name in zip(line['image'], line['image_path']): 27 | path = osp.join(img_root, im_name) 28 | if not read_ok(path): 29 | decode_base64_to_image_file(img, path) 30 | tgt_path.append(path) 31 | else: 32 | tgt_path = osp.join(img_root, f"{line['index']}.jpg") 33 | if not read_ok(tgt_path): 34 | decode_base64_to_image_file(line['image'], tgt_path) 35 | tgt_path = [tgt_path] 36 | else: 37 | assert 'image_path' in line 38 | tgt_path = toliststr(line['image_path']) 39 | 40 | return tgt_path 41 | -------------------------------------------------------------------------------- /vlmeval/utils/matching_util.py: -------------------------------------------------------------------------------- 1 | import string 2 | import copy as cp 3 | import os 4 | from ..smp import * 5 | 6 | 7 | def can_infer_option(answer, choices): 8 | verbose = os.environ.get('VERBOSE', 0) 9 | # Choices is a dictionary 10 | if 'Failed to obtain answer via API' in answer: 11 | return False 12 | 13 | reject_to_answer = [ 14 | "Sorry, I can't help with images of people yet.", 15 | "I can't process this file.", 16 | "I'm sorry, but without the image provided", 17 | 'Cannot determine the answer' 18 | ] 19 | for err in reject_to_answer: 20 | if err in answer: 21 | return 'Z' 22 | 23 | def count_choice(splits, choices, prefix='', suffix=''): 24 | cnt = 0 25 | for c in choices: 26 | if prefix + c + suffix in splits: 27 | cnt += 1 28 | return cnt 29 | 30 | answer_mod = cp.copy(answer) 31 | chars = '.()[],:;!*#{}' 32 | for c in chars: 33 | answer_mod = answer_mod.replace(c, ' ') 34 | 35 | splits = [x.strip() for x in answer_mod.split()] 36 | count = count_choice(splits, choices) 37 | 38 | if count == 1: 39 | for ch in choices: 40 | if 'A' in splits and len(splits) > 3 and verbose: 41 | logger = get_logger('Evaluation') 42 | logger.info(f'A might be a quantifier in the string: {answer}.') 43 | return False 44 | if ch in splits: 45 | return ch 46 | elif count == 0 and count_choice(splits, {'Z', ''}) == 1: 47 | return 'Z' 48 | return False 49 | 50 | 51 | def can_infer_text(answer, choices): 52 | answer = answer.lower() 53 | assert isinstance(choices, dict) 54 | for k in choices: 55 | assert k in string.ascii_uppercase 56 | 
choices[k] = str(choices[k]).lower() 57 | cands = [] 58 | for k in choices: 59 | if choices[k] in answer: 60 | cands.append(k) 61 | if len(cands) == 1: 62 | return cands[0] 63 | return False 64 | 65 | 66 | def can_infer(answer, choices): 67 | answer = str(answer) 68 | copt = can_infer_option(answer, choices) 69 | return copt if copt else can_infer_text(answer, choices) 70 | -------------------------------------------------------------------------------- /vlmeval/utils/result_transfer.py: -------------------------------------------------------------------------------- 1 | from ..evaluate.misc import build_judge 2 | from ..evaluate.multiple_choice import extract_answer_from_item 3 | 4 | from ..smp import * 5 | from .matching_util import can_infer 6 | from .mp_util import track_progress_rich 7 | 8 | 9 | def MMMU_result_transfer(result_path): 10 | res = {} 11 | result_data = load(result_path) 12 | mcq = result_data['A'].notna() 13 | lt = len(result_data) 14 | for i in range(lt): 15 | line = result_data.iloc[i] 16 | if mcq[i]: 17 | options = { 18 | cand: line[cand] 19 | for cand in string.ascii_uppercase 20 | if cand in line and not pd.isna(line[cand]) 21 | } 22 | prediction = line['prediction'] 23 | infer_prediction = can_infer(prediction, options) 24 | res[line['id']] = infer_prediction 25 | else: 26 | res[line['id']] = line['prediction'] 27 | result_json = result_path.replace('.xlsx', '.json') 28 | dump(res, result_json) 29 | return result_json 30 | 31 | 32 | def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): 33 | logger = get_logger('Evaluation') 34 | INTERNAL = os.environ.get('INTERNAL', 0) 35 | nproc = judge_kwargs.pop('nproc', 4) 36 | 37 | rd.seed(2680) 38 | suffix = eval_file.split('.')[-1] 39 | model = judge_kwargs['model'] 40 | assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] 41 | name_str_map = { 42 | 'chatgpt-0125': 'openai', 43 | 'gpt-4-0125': 'gpt4' 44 | } 45 | name_str = name_str_map[model] if model in name_str_map else model 46 | 47 | if model == 'exact_matching': 48 | model = None 49 | else: 50 | if INTERNAL or gpt_key_set(): 51 | model = build_judge(**judge_kwargs) 52 | else: 53 | logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') 54 | model = None 55 | 56 | logger.info(f'Evaluating {eval_file}') 57 | result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') 58 | result = {} 59 | if osp.exists(result_file): 60 | result = load(result_file) 61 | 62 | data = load(eval_file) 63 | assert 'index' in data, 'Essential columns missing in the eval_file.'
64 | 65 | data = data.sort_values(by='index') 66 | data['prediction'] = [str(x) for x in data['prediction']] 67 | for k in data.keys(): 68 | data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) 69 | 70 | idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))} 71 | idx2lines = {k: v for k, v in idx2lines.items() if k not in result} 72 | 73 | indices = list(idx2lines.keys()) 74 | lines = [idx2lines[i] for i in indices] 75 | tups = [(model, line) for line in lines] 76 | res = track_progress_rich( 77 | extract_answer_from_item, 78 | tups, 79 | nproc=nproc, 80 | chunksize=nproc, 81 | save=result_file, 82 | keys=indices) 83 | 84 | for i, r in zip(indices, res): 85 | if i in result: 86 | assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log'] 87 | else: 88 | result[i] = r 89 | 90 | indices = list(data['index']) 91 | data['opt'] = [result[i]['opt'] for i in data['index']] 92 | data['log'] = [result[i]['log'] for i in data['index']] 93 | 94 | # load split 95 | output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') 96 | dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) 97 | return output_path 98 | -------------------------------------------------------------------------------- /vlmeval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.set_grad_enabled(False) 4 | torch.manual_seed(1234) 5 | from .base import BaseModel 6 | from .cogvlm import CogVlm, GLM4v 7 | from .emu import Emu 8 | from .idefics import IDEFICS, IDEFICS2 9 | from .instructblip import InstructBLIP 10 | from .llava import LLaVA, LLaVA_Next, LLaVA_XTuner 11 | from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V 12 | from .minigpt4 import MiniGPT4 13 | from .mmalaya import MMAlaya 14 | from .monkey import Monkey, MonkeyChat 15 | from .mplug_owl2 import mPLUG_Owl2 16 | from .omnilmm import OmniLMM12B 17 | from .open_flamingo import OpenFlamingo 18 | from .pandagpt import PandaGPT 19 | from .qwen_vl import QwenVL, QwenVLChat 20 | from .transcore_m import TransCoreM 21 | from .visualglm import VisualGLM 22 | from .xcomposer import ShareCaptioner, XComposer, XComposer2, XComposer2_4KHD 23 | from .yi_vl import Yi_VL 24 | from .internvl_chat import InternVLChat 25 | from .deepseek_vl import DeepSeekVL 26 | from .mgm import Mini_Gemini 27 | from .bunnyllama3 import BunnyLLama3 28 | from .vxverse import VXVERSE 29 | from .paligemma import PaliGemma 30 | from .qh_360vl import QH_360VL 31 | from .phi3_vision import Phi3Vision 32 | from .wemm import WeMM 33 | -------------------------------------------------------------------------------- /vlmeval/vlm/bunnyllama3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from PIL import Image 5 | import warnings 6 | 7 | from .base import BaseModel 8 | from ..smp import * 9 | from ..utils import DATASET_TYPE 10 | 11 | 12 | class BunnyLLama3(BaseModel): 13 | 14 | INSTALL_REQ = False 15 | INTERLEAVE = False 16 | 17 | def __init__(self, model_path='BAAI/Bunny-Llama-3-8B-V', **kwargs): 18 | assert model_path is not None 19 | transformers.logging.set_verbosity_error() 20 | transformers.logging.disable_progress_bar() 21 | warnings.filterwarnings('ignore') 22 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 23 | self.model = 
AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True) 24 | self.kwargs = kwargs 25 | 26 | def generate_inner(self, message, dataset=None): 27 | prompt, image_path = self.message_to_promptimg(message) 28 | text = f"A chat between a curious user and an artificial intelligence assistant. \ 29 | The assistant gives helpful, detailed, and polite answers to the user's questions. \ 30 | USER: <image>\n{prompt} ASSISTANT:" 31 | text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('<image>')] 32 | input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0) 33 | image = Image.open(image_path).convert('RGB') 34 | image_tensor = self.model.process_images([image], self.model.config).to(dtype=self.model.dtype) 35 | 36 | output_ids = self.model.generate(input_ids, images=image_tensor, max_new_tokens=100, use_cache=True)[0] 37 | response = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True) 38 | return response 39 | -------------------------------------------------------------------------------- /vlmeval/vlm/deepseek_vl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from transformers import AutoModelForCausalLM 4 | import warnings 5 | from .base import BaseModel 6 | 7 | 8 | class DeepSeekVL(BaseModel): 9 | 10 | INSTALL_REQ = True 11 | INTERLEAVE = True 12 | 13 | def check_install(self): 14 | try: 15 | import deepseek_vl 16 | except ImportError: 17 | warnings.warn( 18 | 'Please first install deepseek_vl from source codes in: https://github.com/deepseek-ai/DeepSeek-VL') 19 | sys.exit(-1) 20 | 21 | def __init__(self, model_path='deepseek-ai/deepseek-vl-1.3b-chat', **kwargs): 22 | self.check_install() 23 | assert model_path is not None 24 | self.model_path = model_path 25 | from deepseek_vl.models import VLChatProcessor 26 | 27 | self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path) 28 | self.tokenizer = self.vl_chat_processor.tokenizer 29 | 30 | model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) 31 | self.model = model.to(torch.bfloat16).cuda().eval() 32 | 33 | torch.cuda.empty_cache() 34 | default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True) 35 | default_kwargs.update(kwargs) 36 | self.kwargs = default_kwargs 37 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config.
') 38 | 39 | def prepare_inputs(self, message): 40 | content, images = '', [] 41 | for s in message: 42 | if s['type'] == 'image': 43 | images.append(s['value']) 44 | content += '<image_placeholder>' 45 | elif s['type'] == 'text': 46 | content += s['value'] 47 | conversation = [ 48 | dict(role='User', content=content, images=images), 49 | dict(role='Assistant', content='') 50 | ] 51 | return conversation 52 | 53 | def generate_inner(self, message, dataset=None): 54 | conversation = self.prepare_inputs(message) 55 | from deepseek_vl.utils.io import load_pil_images 56 | pil_images = load_pil_images(conversation) 57 | prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True) 58 | prepare_inputs = prepare_inputs.to(self.model.device) 59 | inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs) 60 | 61 | outputs = self.model.language_model.generate( 62 | inputs_embeds=inputs_embeds, 63 | attention_mask=prepare_inputs.attention_mask, 64 | pad_token_id=self.tokenizer.eos_token_id, 65 | bos_token_id=self.tokenizer.bos_token_id, 66 | eos_token_id=self.tokenizer.eos_token_id, 67 | **self.kwargs) 68 | answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True) 69 | return answer 70 | -------------------------------------------------------------------------------- /vlmeval/vlm/emu.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from PIL import Image 4 | import os.path as osp 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class Emu(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = True 13 | 14 | def __init__(self, 15 | model_path='BAAI/Emu2-Chat', 16 | **kwargs): 17 | 18 | self.model_path = model_path 19 | assert osp.exists(model_path) or splitlen(model_path) == 2 20 | 21 | from transformers import AutoModelForCausalLM, AutoTokenizer 22 | from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model 23 | 24 | local_rank = os.environ.get('LOCAL_RANK', 0) 25 | 26 | device_num = torch.cuda.device_count() 27 | assert local_rank * 2 <= device_num, 'The number of devices does not match the world size' 28 | assert device_num >= 2, 'You need at least 2 GPUs to use EMU' 29 | 30 | device_1 = local_rank 31 | device_2 = local_rank + device_num // 2 32 | 33 | torch.cuda.set_device(device_1) 34 | torch.cuda.set_device(device_2) 35 | 36 | tokenizer = AutoTokenizer.from_pretrained(model_path) # "BAAI/Emu2-Chat" 37 | self.tokenizer = tokenizer 38 | with init_empty_weights(): 39 | model = AutoModelForCausalLM.from_pretrained( 40 | model_path, # "BAAI/Emu2-Chat" 41 | torch_dtype=torch.bfloat16, 42 | low_cpu_mem_usage=True, 43 | trust_remote_code=True) 44 | 45 | device_map = infer_auto_device_map( 46 | model, 47 | max_memory={ 48 | device_1: '38GiB', 49 | device_2: '38GiB' 50 | }, 51 | no_split_module_classes=['Block', 'LlamaDecoderLayer']) 52 | 53 | # input and output logits should be on same device 54 | device_map['model.decoder.lm.lm_head'] = device_1 55 | 56 | model = dispatch_model( 57 | model, 58 | device_map=device_map).eval() 59 | 60 | self.model = model 61 | kwargs_default = dict(max_new_tokens=512, length_penalty=-1) 62 | kwargs_default.update(kwargs) 63 | self.kwargs = kwargs_default 64 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config.
') 65 | 66 | def generate_inner(self, message, dataset=None): 67 | query, images = '', [] 68 | for item in message: 69 | if item['type'] == 'image': 70 | images.append(Image.open(item['value']).convert('RGB')) 71 | query += '[]' 72 | elif item['type'] == 'text': 73 | query += item['value'] 74 | 75 | inputs = self.model.build_input_ids( 76 | text=[query], 77 | tokenizer=self.tokenizer, 78 | image=images 79 | ) 80 | 81 | with torch.no_grad(): 82 | outputs = self.model.generate( 83 | input_ids=inputs['input_ids'], 84 | attention_mask=inputs['attention_mask'], 85 | image=inputs['image'].to(torch.bfloat16), 86 | **self.kwargs) 87 | 88 | output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) 89 | return output_text[0] 90 | -------------------------------------------------------------------------------- /vlmeval/vlm/instructblip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import os.path as osp 4 | import sys 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class InstructBLIP(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, name): 15 | self.config_map = { 16 | 'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml', 17 | 'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml', 18 | } 19 | 20 | self.file_path = __file__ 21 | config_root = osp.dirname(self.file_path) 22 | 23 | try: 24 | from lavis.models import load_preprocess 25 | from omegaconf import OmegaConf 26 | from lavis.common.registry import registry 27 | except: 28 | warnings.warn('Please install lavis before using InstructBLIP. ') 29 | sys.exit(-1) 30 | 31 | assert name in self.config_map 32 | cfg_path = osp.join(config_root, self.config_map[name]) 33 | cfg = OmegaConf.load(cfg_path) 34 | 35 | model_cfg = cfg.model 36 | assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2 37 | model_cls = registry.get_model_class(name='blip2_vicuna_instruct') 38 | model = model_cls.from_config(model_cfg) 39 | model.eval() 40 | 41 | self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu' 42 | device = self.device 43 | model.to(device) 44 | self.model = model 45 | self.kwargs = {'max_length': 512} 46 | 47 | preprocess_cfg = cfg.preprocess 48 | vis_processors, _ = load_preprocess(preprocess_cfg) 49 | self.vis_processors = vis_processors 50 | 51 | def generate_inner(self, message, dataset=None): 52 | prompt, image_path = self.message_to_promptimg(message) 53 | vis_processors = self.vis_processors 54 | raw_image = Image.open(image_path).convert('RGB') 55 | image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device) 56 | outputs = self.model.generate(dict(image=image_tensor, prompt=prompt)) 57 | return outputs[0] 58 | -------------------------------------------------------------------------------- /vlmeval/vlm/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava import LLaVA, LLaVA_Next 2 | from .llava_xtuner import LLaVA_XTuner 3 | 4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner'] 5 | -------------------------------------------------------------------------------- /vlmeval/vlm/minigpt4.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import os.path as osp 4 | import warnings 5 | from transformers import StoppingCriteriaList 6 | from .base import BaseModel 7 | 8 | 9 | class 
MiniGPT4(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, 15 | mode='v2', 16 | root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/', 17 | temperature=1, 18 | max_out_len=512): 19 | 20 | if root is None: 21 | warnings.warn( 22 | 'Please set root to the directory of MiniGPT-4, which is cloned from here: ' 23 | 'https://github.com/Vision-CAIR/MiniGPT-4. ' 24 | ) 25 | 26 | if mode == 'v2': 27 | cfg = 'minigptv2_eval.yaml' 28 | elif mode == 'v1_7b': 29 | cfg = 'minigpt4_7b_eval.yaml' 30 | elif mode == 'v1_13b': 31 | cfg = 'minigpt4_13b_eval.yaml' 32 | else: 33 | raise NotImplementedError 34 | 35 | self.mode = mode 36 | self.temperature = temperature 37 | self.max_out_len = max_out_len 38 | self.root = root 39 | this_dir = osp.dirname(__file__) 40 | 41 | self.cfg = osp.join(this_dir, 'misc', cfg) 42 | sys.path.append(self.root) 43 | 44 | from omegaconf import OmegaConf 45 | from minigpt4.common.registry import registry 46 | from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2 47 | 48 | device = torch.cuda.current_device() 49 | self.device = device 50 | 51 | cfg_path = self.cfg 52 | cfg = OmegaConf.load(cfg_path) 53 | 54 | model_cfg = cfg.model 55 | model_cfg.device_8bit = device 56 | model_cls = registry.get_model_class(model_cfg.arch) 57 | model = model_cls.from_config(model_cfg) 58 | model = model.to(device) 59 | model.eval() 60 | vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train 61 | vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg) 62 | self.model = model 63 | self.vis_processor = vis_processor 64 | 65 | self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0 66 | stop_words_ids = [[835], [2277, 29937]] 67 | stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids] 68 | self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)]) 69 | 70 | def generate_inner(self, message, dataset=None): 71 | from minigpt4.conversation.conversation import Chat 72 | prompt, image_path = self.message_to_promptimg(message) 73 | if self.mode == 'v2': 74 | chat = Chat(self.model, self.vis_processor, device=self.device) 75 | else: 76 | chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria) 77 | 78 | chat_state = self.CONV_VISION.copy() 79 | img_list = [] 80 | _ = chat.upload_img(image_path, chat_state, img_list) 81 | chat.encode_img(img_list) 82 | chat.ask(prompt, chat_state) 83 | with torch.inference_mode(): 84 | msg = chat.answer(conv=chat_state, img_list=img_list)[0] 85 | return msg 86 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigpt4_13b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-13b-v0" 25 | 26 | datasets: 27 | cc_sbu_align: 28 | vis_processor: 29 | train: 30 | name: "blip2_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | 36 | run: 37 | task: image_text_pretrain 38 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigpt4_7b_eval.yaml: 
-------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-7b-v0" 25 | 26 | 27 | datasets: 28 | cc_sbu_align: 29 | vis_processor: 30 | train: 31 | name: "blip2_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | 37 | run: 38 | task: image_text_pretrain 39 | -------------------------------------------------------------------------------- /vlmeval/vlm/misc/minigptv2_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt_v2 3 | model_type: pretrain 4 | max_txt_len: 160 5 | end_sym: "" 6 | low_resource: True 7 | prompt_template: '[INST] {} [/INST]' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | lora_r: 64 10 | lora_alpha: 16 11 | 12 | # vit encoder 13 | image_size: 448 14 | drop_path_rate: 0 15 | use_grad_checkpoint: False 16 | vit_precision: "fp16" 17 | freeze_vit: True 18 | 19 | # generation configs 20 | prompt: "" 21 | 22 | # LLM 23 | llama_model: "please set this value to the path of llama2-chat-7b" 24 | 25 | datasets: 26 | cc_sbu_align: 27 | vis_processor: 28 | train: 29 | name: "blip2_image_eval" 30 | image_size: 448 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | 35 | run: 36 | task: image_text_pretrain 37 | -------------------------------------------------------------------------------- /vlmeval/vlm/mmalaya.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | from PIL import Image 5 | from .base import BaseModel 6 | 7 | 8 | class MMAlaya(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='DataCanvas/MMAlaya', **kwargs): 14 | assert model_path is not None 15 | self.model_path = model_path 16 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 17 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True).eval() 18 | # need initialize tokenizer 19 | model.initialize_tokenizer(self.tokenizer) 20 | self.model = model.cuda() 21 | 22 | self.kwargs = kwargs 23 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 24 | torch.cuda.empty_cache() 25 | 26 | def generate_inner(self, message, dataset=None): 27 | # read image 28 | prompt, image_path = self.message_to_promptimg(message) 29 | image = Image.open(image_path).convert('RGB') 30 | # tokenize prompt, and proprecess image 31 | input_ids, image_tensor, stopping_criteria = self.model.prepare_for_inference( 32 | prompt, 33 | self.tokenizer, 34 | image, 35 | return_tensors='pt') 36 | with torch.inference_mode(): 37 | output_ids = self.model.generate( 38 | inputs=input_ids.cuda(), 39 | images=image_tensor.cuda(), 40 | do_sample=False, 41 | max_new_tokens=512, 42 | num_beams=1, 43 | use_cache=True, 44 | stopping_criteria=[stopping_criteria], 45 | ) 46 | # truncate input_ids in generate_ids and then decode to text 47 | input_token_len = input_ids.shape[1] 48 | response = self.tokenizer.batch_decode( 49 | output_ids[:, input_token_len:].cpu(), 50 | skip_special_tokens=True, 51 | clean_up_tokenization_spaces=False 52 | )[0].strip() 53 | return response 54 | 55 | 56 | if __name__ == '__main__': 57 | model = MMAlaya() 58 | response = model.generate(['./assets/apple.jpg', '请详细描述一下这张图片。']) 59 | print(response) 60 | 61 | """ 62 | export PYTHONPATH=$PYTHONPATH:/tmp/VLMEvalKit 63 | CUDA_VISIBLE_DEVICES=0 python vlmeval/vlm/mmalaya.py 64 | """ 65 | -------------------------------------------------------------------------------- /vlmeval/vlm/open_flamingo.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from PIL import Image 4 | import os.path as osp 5 | import warnings 6 | from .base import BaseModel 7 | from ..smp import splitlen, get_cache_path 8 | from huggingface_hub import snapshot_download 9 | 10 | 11 | class OpenFlamingo(BaseModel): 12 | 13 | INSTALL_REQ = True 14 | INTERLEAVE = True 15 | 16 | def __init__(self, 17 | name, 18 | mpt_pth=None, 19 | ckpt_pth=None, 20 | **kwargs): 21 | 22 | if mpt_pth is None: 23 | warnings.warn( 24 | 'Please set `mpt_pth` to the directory of MPT-7B, which is cloned from here: ' 25 | 'https://huggingface.co/mosaicml/mpt-7b. ' 26 | ) 27 | sys.exit(-1) 28 | if ckpt_pth is None: 29 | warnings.warn( 30 | 'Please set `ckpt_pth` to the openflamingo ckpt, which is the `checkpoint.pt` file downloaded ' 31 | 'from: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b/tree/main. 
' 32 | ) 33 | sys.exit(-1) 34 | else: 35 | if osp.exists(ckpt_pth): 36 | if ckpt_pth.endswith('checkpoint.pt'): 37 | pass 38 | elif osp.isdir(ckpt_pth): 39 | ckpt_pth = osp.join(ckpt_pth, 'checkpoint.pt') 40 | if not osp.exists(ckpt_pth): 41 | sys.exit(-1) 42 | elif splitlen(ckpt_pth, '/') == 2: 43 | cache_path = get_cache_path(ckpt_pth) 44 | if cache_path is None: 45 | snapshot_download(ckpt_pth) 46 | cache_path = get_cache_path(ckpt_pth) 47 | if cache_path is None: 48 | sys.exit(-1) 49 | else: 50 | ckpt_pth = osp.join(cache_path, 'checkpoint.pt') 51 | 52 | self.name = name 53 | assert name in ['v2'] 54 | self.mpt_pth = mpt_pth 55 | try: 56 | from open_flamingo import create_model_and_transforms 57 | except: 58 | raise ImportError('Please first install open_flamingo to use OpenFlamingo') 59 | model, image_processor, tokenizer = create_model_and_transforms( 60 | clip_vision_encoder_path='ViT-L-14', 61 | clip_vision_encoder_pretrained='openai', 62 | lang_encoder_path=mpt_pth, 63 | tokenizer_path=mpt_pth, 64 | cross_attn_every_n_layers=4) 65 | ckpt = torch.load(ckpt_pth) 66 | model.load_state_dict(ckpt, strict=False) 67 | torch.cuda.empty_cache() 68 | self.model = model.eval().cuda() 69 | self.tokenizer = tokenizer 70 | self.tokenizer.padding_side = 'left' 71 | self.image_proc = image_processor 72 | 73 | kwargs_default = dict(max_new_tokens=512, num_beams=3) 74 | kwargs_default.update(kwargs) 75 | self.kwargs = kwargs_default 76 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 77 | 78 | def generate_inner(self, message, dataset=None): 79 | vision_x = [] 80 | prompt = '' 81 | for msg in message: 82 | if msg['type'] == 'image': 83 | img = Image.open(msg['value']) 84 | vision_x.append(self.image_proc(img).unsqueeze(0)) 85 | prompt += '' 86 | elif msg['type'] == 'text': 87 | prompt += msg['value'] 88 | prompt += 'Answer: ' 89 | vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0] 90 | vision_x = vision_x.unsqueeze(1).unsqueeze(0) 91 | lang_x = self.tokenizer([prompt], return_tensors='pt') 92 | generated_text = self.model.generate( 93 | vision_x=vision_x.cuda(), 94 | lang_x=lang_x['input_ids'].cuda(), 95 | attention_mask=lang_x['attention_mask'].cuda(), 96 | **self.kwargs) 97 | generated_text = self.tokenizer.decode(generated_text[0]) 98 | text = generated_text[len(prompt):].split('<|endofchunk|>')[0] 99 | return text 100 | -------------------------------------------------------------------------------- /vlmeval/vlm/paligemma.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class PaliGemma(BaseModel): 9 | INSTALL_REQ = False 10 | INTERLEAVE = False 11 | 12 | def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs): 13 | try: 14 | from transformers import AutoProcessor, PaliGemmaForConditionalGeneration 15 | except: 16 | warnings.warn('Please install the latest version transformers.') 17 | sys.exit(-1) 18 | model = PaliGemmaForConditionalGeneration.from_pretrained( 19 | model_path, 20 | torch_dtype=torch.bfloat16, 21 | device_map='cpu', 22 | revision='bfloat16', 23 | ).eval() 24 | self.model = model.cuda() 25 | self.processor = AutoProcessor.from_pretrained(model_path) 26 | self.kwargs = kwargs 27 | 28 | def generate_inner(self, message, dataset=None): 29 | prompt, image_path = self.message_to_promptimg(message) 30 | image = Image.open(image_path).convert('RGB') 31 | 
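# The processor call below tokenizes `prompt` and converts `image` into pixel values in a
# single batch on the GPU; `input_len` records the prompt length so that only the newly
# generated tokens are decoded into the final response.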
32 | model_inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda') 33 | input_len = model_inputs['input_ids'].shape[-1] 34 | 35 | with torch.inference_mode(): 36 | generation = self.model.generate(**model_inputs, max_new_tokens=512, do_sample=False) 37 | generation = generation[0][input_len:] 38 | res = self.processor.decode(generation, skip_special_tokens=True) 39 | return res 40 | -------------------------------------------------------------------------------- /vlmeval/vlm/pandagpt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import os.path as osp 4 | import warnings 5 | from .base import BaseModel 6 | 7 | 8 | class PandaGPT(BaseModel): 9 | 10 | INSTALL_REQ = True 11 | INTERLEAVE = False 12 | 13 | def __init__(self, name, root=None, **kwargs): 14 | if root is None: 15 | warnings.warn('Please set `root` to PandaGPT code directory, which is cloned from here: ') 16 | sys.exit(-1) 17 | 18 | assert name == 'PandaGPT_13B' 19 | self.name = name 20 | sys.path.append(osp.join(root, 'code')) 21 | try: 22 | from model.openllama import OpenLLAMAPEFTModel 23 | except: 24 | raise ImportError( 25 | 'Please first install PandaGPT and set the root path to use PandaGPT, ' 26 | 'which is cloned from here: https://github.com/yxuansu/PandaGPT. ' 27 | ) 28 | self.args = { 29 | 'model': 'openllama_peft', 30 | 'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'), 31 | 'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'), 32 | 'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'), 33 | 'stage': 2, 34 | 'max_tgt_len': 512, 35 | 'lora_r': 32, 36 | 'lora_alpha': 32, 37 | 'lora_dropout': 0.1, 38 | } 39 | model = OpenLLAMAPEFTModel(**self.args) 40 | delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) 41 | model.load_state_dict(delta_ckpt, strict=False) 42 | torch.cuda.empty_cache() 43 | self.model = model.eval().half().cuda() 44 | kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} 45 | kwargs_default.update(kwargs) 46 | self.kwargs = kwargs_default 47 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 48 | 49 | def generate_inner(self, message, dataset=None): 50 | prompt, image_path = self.message_to_promptimg(message) 51 | struct = { 52 | 'prompt': prompt, 53 | 'image_paths': [image_path], 54 | 'audio_paths': [], 55 | 'video_paths': [], 56 | 'thermal_paths': [], 57 | 'modality_embeds': [] 58 | } 59 | struct.update(self.kwargs) 60 | resp = self.model.generate(struct) 61 | return resp 62 | -------------------------------------------------------------------------------- /vlmeval/vlm/phi3_vision.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class Phi3Vision(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs): 14 | try: 15 | from transformers import AutoProcessor, AutoModelForCausalLM 16 | except: 17 | warnings.warn('Please install the latest version transformers.') 18 | sys.exit(-1) 19 | model = AutoModelForCausalLM.from_pretrained( 20 | model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto').eval() 21 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) 22 | self.model = model 23 | self.processor = processor 24 | self.kwargs = kwargs 25 | 26 | def generate_inner(self, message, dataset=None): 27 | prompt, image_path = self.message_to_promptimg(message) 28 | image = Image.open(image_path).convert('RGB') 29 | messages = [ 30 | {'role': 'user', 'content': f'<|image_1|>\n{prompt}'} 31 | ] 32 | prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 33 | inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda') 34 | 35 | generation_args = { 36 | 'max_new_tokens': 500, 37 | 'temperature': 0.0, 38 | 'do_sample': False, 39 | } 40 | generation_args.update(self.kwargs) 41 | 42 | generate_ids = self.model.generate( 43 | **inputs, 44 | eos_token_id=self.processor.tokenizer.eos_token_id, 45 | **generation_args 46 | ) 47 | generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] 48 | response = self.processor.batch_decode( 49 | generate_ids, 50 | skip_special_tokens=True, 51 | clean_up_tokenization_spaces=False 52 | )[0] 53 | return response 54 | -------------------------------------------------------------------------------- /vlmeval/vlm/qh_360vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import os.path as osp 5 | from PIL import Image 6 | from .base import BaseModel 7 | from ..smp import * 8 | from ..utils import DATASET_TYPE 9 | 10 | 11 | class QH_360VL(BaseModel): 12 | 13 | INSTALL_REQ = False 14 | INTERLEAVE = False 15 | 16 | def __init__(self, model_path='qihoo360/360VL-70B', **kwargs): 17 | assert model_path is not None 18 | self.model_path = model_path 19 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 20 | self.model = AutoModelForCausalLM.from_pretrained(model_path, 21 | torch_dtype=torch.float16, 22 | low_cpu_mem_usage=True, 23 | device_map='auto', 24 | trust_remote_code=True).eval() 25 | vision_tower = self.model.get_vision_tower() 26 | vision_tower.load_model() 27 | vision_tower.to(device='cuda', dtype=torch.float16) 28 | self.image_processor = vision_tower.image_processor 29 | self.tokenizer.pad_token = self.tokenizer.eos_token 30 | 
self.kwargs = kwargs 31 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 32 | torch.cuda.empty_cache() 33 | 34 | def generate(self, message, dataset=None): 35 | 36 | prompt, image_path = self.message_to_promptimg(message) 37 | print(prompt) 38 | image = Image.open(image_path).convert('RGB') 39 | terminators = [ 40 | self.tokenizer.convert_tokens_to_ids('<|eot_id|>',) 41 | ] 42 | inputs = self.model.build_conversation_input_ids(self.tokenizer, 43 | query=prompt, 44 | image=image, 45 | image_processor=self.image_processor) 46 | input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True) 47 | images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True) 48 | 49 | output_ids = self.model.generate(input_ids=input_ids, 50 | images=images, 51 | do_sample=False, 52 | num_beams=1, 53 | max_new_tokens=512, 54 | eos_token_id=terminators, 55 | use_cache=True) 56 | 57 | input_token_len = input_ids.shape[1] 58 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 59 | response = outputs.strip() 60 | 61 | return response 62 | -------------------------------------------------------------------------------- /vlmeval/vlm/qwen_vl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import warnings 4 | import copy as cp 5 | from .base import BaseModel 6 | from ..smp import isimg, listinstr 7 | from ..utils import DATASET_TYPE 8 | 9 | 10 | class QwenVL(BaseModel): 11 | 12 | INSTALL_REQ = False 13 | INTERLEAVE = True 14 | 15 | def __init__(self, model_path='Qwen/Qwen-VL', **kwargs): 16 | assert model_path is not None 17 | self.model_path = model_path 18 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 19 | tokenizer.padding_side = 'left' 20 | tokenizer.pad_token_id = tokenizer.eod_id 21 | self.tokenizer = tokenizer 22 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval() 23 | default_kwargs = dict( 24 | do_sample=False, 25 | num_beams=1, 26 | max_new_tokens=512, 27 | min_new_tokens=1, 28 | num_return_sequences=1, 29 | use_cache=True, 30 | output_hidden_states=True, 31 | pad_token_id=tokenizer.eod_id, 32 | eos_token_id=tokenizer.eod_id) 33 | default_kwargs.update(kwargs) 34 | self.kwargs = default_kwargs 35 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 36 | torch.cuda.empty_cache() 37 | 38 | def adjust_kwargs(self, dataset): 39 | kwargs = cp.deepcopy(self.kwargs) 40 | if DATASET_TYPE(dataset) in ['multi-choice', 'Y/N']: 41 | kwargs['max_new_tokens'] = 32 42 | elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset: 43 | kwargs['max_new_tokens'] = 32 44 | elif DATASET_TYPE(dataset) == 'VQA': 45 | if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset): 46 | kwargs['max_new_tokens'] = 100 47 | elif listinstr(['TextVQA'], dataset): 48 | kwargs['max_new_tokens'] = 10 49 | return kwargs 50 | 51 | def generate_inner(self, message, dataset=None): 52 | if dataset is not None: 53 | kwargs = self.adjust_kwargs(dataset) 54 | else: 55 | kwargs = self.kwargs 56 | prompt = '' 57 | for s in message: 58 | if s['type'] == 'image': 59 | prompt += f'{s["value"]}' 60 | elif s['type'] == 'text': 61 | prompt += s['value'] 62 | if dataset is not None and DATASET_TYPE(dataset) == 'VQA': 63 | prompt += ' Answer:' 64 | encoded = self.tokenizer([prompt], return_tensors='pt', padding='longest') 65 | input_ids = encoded.input_ids.to('cuda') 66 | attention_mask = encoded.attention_mask.to('cuda') 67 | 68 | pred = self.model.generate( 69 | input_ids=input_ids, 70 | attention_mask=attention_mask, 71 | **kwargs) 72 | answer = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip() 73 | return answer 74 | 75 | 76 | class QwenVLChat(BaseModel): 77 | 78 | INSTALL_REQ = False 79 | INTERLEAVE = True 80 | 81 | def __init__(self, model_path='Qwen/Qwen-VL-Chat', **kwargs): 82 | assert model_path is not None 83 | self.model_path = model_path 84 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 85 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval() 86 | torch.cuda.empty_cache() 87 | self.kwargs = kwargs 88 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 89 | 90 | def generate_inner(self, message, dataset=None): 91 | vl_list = [{'image': s['value']} if s['type'] == 'image' else {'text': s['value']} for s in message] 92 | query = self.tokenizer.from_list_format(vl_list) 93 | response, _ = self.model.chat(self.tokenizer, query=query, history=None, **self.kwargs) 94 | return response 95 | -------------------------------------------------------------------------------- /vlmeval/vlm/visualglm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .base import BaseModel 3 | from ..smp import * 4 | 5 | 6 | class VisualGLM(BaseModel): 7 | 8 | INSTALL_REQ = False 9 | INTERLEAVE = False 10 | 11 | def __init__(self, model_path='THUDM/visualglm-6b', **kwargs): 12 | try: 13 | import sat 14 | except: 15 | warnings.warn('Please install SwissArmyTransformer to use VisualGLM') 16 | assert model_path is not None 17 | self.model_path = model_path 18 | 19 | from transformers import AutoModel 20 | from transformers import AutoTokenizer 21 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 22 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda() 23 | self.model = model 24 | self.kwargs = kwargs 25 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') 26 | 27 | def generate_inner(self, message, dataset=None): 28 | prompt, image_path = self.message_to_promptimg(message) 29 | output, _ = self.model.chat( 30 | image_path=image_path, 31 | tokenizer=self.tokenizer, 32 | query=prompt, 33 | history=[], 34 | **self.kwargs 35 | ) 36 | return output 37 | -------------------------------------------------------------------------------- /vlmeval/vlm/wemm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import sys 4 | from ..smp import * 5 | from .base import BaseModel 6 | from ..utils import DATASET_TYPE 7 | from transformers import AutoModel, GenerationConfig 8 | 9 | 10 | class WeMM(BaseModel): 11 | def __init__(self, model_path='feipengma/WeMM', **kwargs): 12 | self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True) 13 | self.wemm.cuda() 14 | self.wemm.eval() 15 | torch.cuda.empty_cache() 16 | 17 | def use_custom_prompt(self, dataset): 18 | assert dataset is not None 19 | if DATASET_TYPE(dataset) == 'multi-choice': 20 | return True 21 | return False 22 | 23 | def build_prompt(self, line, dataset=None): 24 | assert self.use_custom_prompt(dataset) 25 | assert dataset is None or isinstance(dataset, str) 26 | tgt_path = self.dump_image(line, dataset) 27 | question = line['question'] 28 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 29 | if hint is not None: 30 | question = hint + '\n' + question 31 | options = { 32 | cand: line[cand] 33 | for cand in string.ascii_uppercase 34 | if cand in line and not pd.isna(line[cand]) 35 | } 36 | for key, item in options.items(): 37 | question += f'\n{key}. {item}' 38 | prompt = question 39 | 40 | if len(options): 41 | prompt += ( 42 | '\n请直接回答选项字母。' if cn_string(prompt) else 43 | "\nAnswer with the option's letter from the given choices directly." 44 | ) 45 | else: 46 | prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' 47 | 48 | message = [dict(type='text', value=prompt)] 49 | message.extend([dict(type='image', value=p) for p in tgt_path]) 50 | return message 51 | 52 | def generate_inner(self, message, dataset=None): 53 | prompt, image_path = self.message_to_promptimg(message) 54 | 55 | if dataset == 'HallusionBench': 56 | prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.' 
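# Dataset-specific handling: HallusionBench prompts are constrained to yes/no answers above,
# while MMVet (below) receives an explicit sampling GenerationConfig; all other datasets fall
# back to the model's default generation settings (gen_config=None).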
57 | 58 | gen_config = None 59 | if dataset == 'MMVet': 60 | gen_config = GenerationConfig( 61 | max_new_tokens=512, 62 | do_sample=True, 63 | temperatures=0.7, 64 | num_beams=3, 65 | eos_token_id=self.wemm.tokenizer.eos_token_id, 66 | pad_token_id=self.wemm.tokenizer.pad_token_id 67 | if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id, 68 | ) 69 | pred = self.wemm.mm_generate(image_path, prompt, gen_config) 70 | 71 | return pred 72 | -------------------------------------------------------------------------------- /vlmeval/vlm/xcomposer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sharecaptioner import ShareCaptioner 2 | from .xcomposer import XComposer 3 | from .xcomposer2 import XComposer2 4 | from .xcomposer2_4KHD import XComposer2_4KHD 5 | 6 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD'] 7 | -------------------------------------------------------------------------------- /vlmeval/vlm/xcomposer/sharecaptioner.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from ..base import BaseModel 4 | from ...smp import * 5 | from ...utils import DATASET_TYPE 6 | 7 | 8 | class ShareCaptioner(BaseModel): 9 | 10 | INSTALL_REQ = False 11 | INTERLEAVE = False 12 | 13 | def __init__(self, model_path='Lin-Chen/ShareCaptioner', **kwargs): 14 | assert model_path is not None 15 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 16 | self.model = AutoModelForCausalLM.from_pretrained( 17 | model_path, device_map='cuda', trust_remote_code=True).eval() 18 | self.model.tokenizer = tokenizer 19 | self.model.cuda() 20 | self.model.half() 21 | 22 | def use_custom_prompt(self, dataset): 23 | assert dataset is not None 24 | if DATASET_TYPE(dataset) == 'multi-choice': 25 | return True 26 | return False 27 | 28 | def build_prompt(self, line, dataset=None): 29 | assert dataset is None or isinstance(dataset, str) 30 | assert self.use_custom_prompt(dataset) 31 | tgt_path = self.dump_image(line, dataset) 32 | 33 | if dataset is not None and DATASET_TYPE(dataset) == 'multi-choice': 34 | question = line['question'] 35 | hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None 36 | if hint is not None: 37 | question = hint + '\n' + question 38 | 39 | option_candidate = string.ascii_uppercase 40 | options = { 41 | cand: line[cand] 42 | for cand in option_candidate 43 | if cand in line and not pd.isna(line[cand]) 44 | } 45 | for key, item in options.items(): 46 | question += f'\n{key}. {item}' 47 | prompt = question 48 | 49 | if not cn_string(prompt): 50 | prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly." 
51 | else: 52 | prompt = prompt + '\n' + '请直接回答选项字母。' 53 | else: 54 | prompt = line['question'] 55 | message = [dict(type='text', value=prompt)] 56 | message.extend([dict(type='image', value=s) for s in tgt_path]) 57 | return message 58 | 59 | def generate_inner(self, message, dataset=None): 60 | prompt, image_path = self.message_to_promptimg(message) 61 | seg1 = '<|User|>:' 62 | seg2 = f'{prompt}{self.model.eoh}\n<|Bot|>:' 63 | self.seg_emb1 = self.model.encode_text(seg1, add_special_tokens=True) 64 | self.seg_emb2 = self.model.encode_text(seg2, add_special_tokens=False) 65 | 66 | image = Image.open(image_path).convert('RGB') 67 | image = self.model.vis_processor(image).unsqueeze(0) 68 | image = image.to(self.model.device) 69 | tmp_bs = image.shape[0] 70 | tmp_seg_emb1 = self.seg_emb1.repeat(tmp_bs, 1, 1) 71 | tmp_seg_emb2 = self.seg_emb2.repeat(tmp_bs, 1, 1) 72 | with torch.cuda.amp.autocast(): 73 | with torch.no_grad(): 74 | image = self.model.encode_img(image) 75 | input_emb = torch.cat( 76 | [tmp_seg_emb1, image, tmp_seg_emb2], dim=1) 77 | out_embeds = self.model.internlm_model.generate( 78 | inputs_embeds=input_emb, 79 | max_length=500, 80 | num_beams=3, 81 | min_length=1, 82 | do_sample=True, 83 | repetition_penalty=1.5, 84 | length_penalty=1.0, 85 | temperature=1., 86 | eos_token_id=self.model.tokenizer.eos_token_id, 87 | num_return_sequences=1) 88 | 89 | for j, out in enumerate(out_embeds): 90 | out[out == -1] = 2 91 | response = self.model.decode_text([out]) 92 | return response 93 | --------------------------------------------------------------------------------
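The wrapper classes collected above share one calling convention: each implements generate_inner(message, dataset=None), where message is a list of dicts whose 'type' is 'image' or 'text' and whose 'value' is an image path or a prompt string. The sketch below is illustrative only and is not part of the repository dump; the import location of QwenVLChat (assumed to be re-exported from vlmeval.vlm) is an assumption, while the class itself, the message schema, and assets/apple.jpg come from the files shown above.

# Minimal driver sketch (illustrative; the vlmeval.vlm export is an assumption).
from vlmeval.vlm import QwenVLChat  # assumed re-export of the class defined in vlmeval/vlm/qwen_vl.py

# QwenVLChat loads the Hugging Face checkpoint onto the GPU with trust_remote_code=True.
model = QwenVLChat(model_path='Qwen/Qwen-VL-Chat')

# Every wrapper consumes a list of typed segments. INTERLEAVE=True models accept arbitrary
# image/text interleaving; single-image models reduce the list to one prompt and one image
# via message_to_promptimg before generation.
message = [
    dict(type='image', value='assets/apple.jpg'),
    dict(type='text', value='Describe this image in one sentence.'),
]
print(model.generate_inner(message))

Because the segments are typed rather than positional, the same message structure can drive both single-image and interleaved models without per-model prompt templates.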