├── .gitignore
├── LICENSE
├── README.md
├── README_more_samples.md
├── inference
│   ├── bard.py
│   ├── bard.sh
│   ├── claude.py
│   ├── gemini_vision.py
│   ├── gpt4v.py
│   ├── qwen.py
│   └── utils.py
├── mm-vet_evaluator.ipynb
├── mm-vet_evaluator.py
├── results
│   └── llava_llama2_13b_chat.json
└── v2
    ├── README.md
    ├── inference
    │   ├── claude.py
    │   ├── cogagent.py
    │   ├── cogvlm.py
    │   ├── emu2.py
    │   ├── gemini.py
    │   ├── gpt4.py
    │   ├── internvl.py
    │   ├── internvl2.py
    │   ├── ixc2.py
    │   ├── open_flamingo.py
    │   ├── qwen.py
    │   └── utils.py
    ├── mm-vet-v2_evaluator.py
    └── results
        ├── claude-3-5-sonnet-20240620.json
        ├── gemini-1.5-pro.json
        └── gpt-4o-2024-05-13_detail-high.json
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | # [MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities](https://arxiv.org/abs/2308.02490) (ICML 2024)
7 | # [MM-Vet v2](v2/)
8 |
9 |
10 | [Paper](https://arxiv.org/abs/2308.02490)
11 | [Download Dataset](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip)
12 | [Dataset on Hugging Face]
13 | [Leaderboard](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet)
14 | [Online Evaluator](https://huggingface.co/spaces/whyu/MM-Vet_Evaluator)
15 |
16 |
17 |
18 | 2024/08/02: :fire: :fire: We release [**MM-Vet v2**](v2/), the extension of MM-Vet, which includes a new vision-language capability called "image-text sequence understanding", and expands the evaluation set size while maintaining high quality.
19 |
20 |
21 | 2024/03/17: :fire: :fire: We release inference scripts for Qwen-VL and Claude. Qwen-VL-Max and Claude 3 Opus achieve 66.6% and 58.1%, respectively.
22 |
23 |
24 | 2023/12/23: :fire: :fire: We release inference scripts for GPT-4V and Gemini. Gemini Pro Vision achieves a 64.3% score.
25 |
26 |
27 | 2023/10/24: :fire: :fire: We evaluate GPT-4V on MM-Vet and observe that it achieves a 67.7% score, outperforming other methods by a large margin (20%). However, it still falls far short of the full mark (100%), indicating that further efforts are needed to improve the integrated capabilities of LMMs. See the [leaderboard](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet), [updated paper](https://arxiv.org/abs/2308.02490) and [GPT-4V prediction examples](#gpt-4v-prediction-examples).
28 |
29 |
30 | 2023/10/07: :fire: :fire: **We released the [MM-Vet leaderboard](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet) on paperswithcode.com, where you can conveniently add your model results.** Note that the date refers to the model release date rather than the paper date, because some improved model versions were released after the paper.
31 |
32 | In this repo, we offer the data and evaluator of MM-Vet, proposed in our paper "[MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities](https://arxiv.org/abs/2308.02490)". The code is under the Apache 2.0 license, and the dataset is under the CC BY-NC 4.0 license.
33 |
34 |
35 | 
36 | Figure 1: Different from conventional VL benchmarks, which only require one or two capabilities, MM-Vet focuses on the integration of different core VL capabilities, including recognition, OCR, knowledge, language generation, spatial awareness, and math.
37 |
38 | ## Evaluate your model on MM-Vet
39 | **Step 0**: Install the openai package with `pip install "openai>=1"` and get access to the GPT-4/GPT-3.5 API. If you do not have access, you can try the MM-Vet online evaluator on [Hugging Face Space](https://huggingface.co/spaces/whyu/MM-Vet_Evaluator) (but it may take a long time depending on the number of users).
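To quickly confirm that your key and API access work before grading, you can run a minimal sanity check like the sketch below (assuming the `openai>=1` Python client and an `OPENAI_API_KEY` environment variable; the model listing is only illustrative):

```python
import os
from openai import OpenAI

# Minimal sanity check: the key is valid if listing models succeeds.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
print([m.id for m in client.models.list()][:5])
```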
40 |
41 | **Step 1**: Download the MM-Vet data [here](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) and unzip it with `unzip mm-vet.zip`.
42 |
43 | **Step 2**: Run your model on MM-Vet and save the outputs in a JSON file like [llava_llama2_13b_chat.json](results/llava_llama2_13b_chat.json), or just use that file as an example to evaluate (the expected format is sketched below). We also release inference scripts for GPT-4V and Gemini.
44 |
45 | ```bash
46 | image_detail=high # or auto, low; refer to https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding
47 |
48 | python inference/gpt4v.py --mmvet_path /path/to/mm-vet --image_detail ${image_detail}
49 | ```
50 |
51 | ```bash
52 | python inference/gemini_vision.py --mmvet_path /path/to/mm-vet
53 | ```
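The evaluator only needs a mapping from question id to your model's answer. The sketch below shows the expected shape (a minimal, illustrative example assuming the `v1_{index}` ids used in `mm-vet.json` and by `inference/utils.py`; the answers are placeholders):

```python
import json

# Placeholder predictions keyed by MM-Vet question id ("v1_0", "v1_1", ...).
results = {
    "v1_0": "your model's answer to the first question",
    "v1_1": "your model's answer to the second question",
    # ...
}

with open("results/your_model.json", "w") as f:
    json.dump(results, f, indent=4)
```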
54 |
55 | **Step 3**: `git clone https://github.com/yuweihao/MM-Vet.git && cd MM-Vet`, then run the LLM-based evaluator in [mm-vet_evaluator.ipynb](mm-vet_evaluator.ipynb) or [mm-vet_evaluator.py](mm-vet_evaluator.py) (thanks to @HireTheHero for arranging it into a .py version).
56 | ```bash
57 | python mm-vet_evaluator.py --mmvet_path /path/to/mm-vet --result_file results/llava_llama2_13b_chat.json
58 | ```
59 | If you cannot access GPT-4 (gpt-4-0613), you can upload your model output results (the JSON file) to the MM-Vet online evaluator on [Hugging Face Space](https://huggingface.co/spaces/whyu/MM-Vet_Evaluator) to get the grading results.
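Under the hood, the evaluator asks an LLM grader to compare each ground truth and prediction and return a correctness score between 0.0 and 1.0, using the prompt in [mm-vet_evaluator.ipynb](mm-vet_evaluator.ipynb). The snippet below is a stripped-down sketch of that idea (assuming the `openai>=1` client; the actual scripts build a longer few-shot prompt and aggregate the scores per capability):

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def grade(question: str, ground_truth: str, prediction: str) -> float:
    """Ask the LLM grader for a correctness score in [0.0, 1.0] (illustrative only)."""
    prompt = (
        "Compare the ground truth and prediction from AI models, to give a "
        "correctness score for the prediction between 0.0 (totally wrong) "
        "and 1.0 (totally right). Reply with the score only.\n"
        f"Question: {question}\nGround truth: {ground_truth}\n"
        f"Prediction: {prediction}\nCorrectness score:"
    )
    response = client.chat.completions.create(
        model="gpt-4-0613",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=5,
    )
    return float(response.choices[0].message.content.strip())
```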
60 |
61 |
62 | ## Citation
63 | ```
64 | @inproceedings{yu2024mm,
65 | title={Mm-vet: Evaluating large multimodal models for integrated capabilities},
66 | author={Yu, Weihao and Yang, Zhengyuan and Li, Linjie and Wang, Jianfeng and Lin, Kevin and Liu, Zicheng and Wang, Xinchao and Wang, Lijuan},
67 | booktitle={International conference on machine learning},
68 | year={2024},
69 | organization={PMLR}
70 | }
71 | ```
72 |
73 | ## GPT-4V Prediction Examples
74 | 
75 |
76 | 
77 |
78 | 
79 |
80 |
81 |
82 | ## About running Bard
83 | Please refer to these two files: [bard.sh](inference/bard.sh) and [bard.py](inference/bard.py).
84 |
85 |
86 | ## Some samples on MM-Vet
87 | 
88 |
89 | **Q**: What occasions would someone use this meme?
90 |
91 | **GT**: This meme, commonly known as "Screaming Panda," is typically used to express shock, surprise, or fear. It could be used in response to a startling or unexpected event, or to convey a sense of panic or alarm. Some possible occasions where someone might use this meme include:
92 |
93 | - Reacting to a jump scare in a horror movie
94 | - Responding to a surprising plot twist in a TV show or book
95 | - Expressing shock at a news headline or current event
96 | - Conveying fear or anxiety about an upcoming deadline or exam
97 | - Showing surprise at an unexpected outcome in a sports game or other competition.
98 |
99 | **Required capabilities**: Recognition, knowledge, language generation
100 |
101 | ---
102 |
103 | 
104 |
105 | **Q**: How many tomatoes are there?
106 |
107 | **GT**: 5
108 |
109 | **Required capabilities**: Recognition
110 |
111 | ---
112 |
113 | 
114 |
115 | **Q**: What is located to the right of the shampoo?
116 |
117 | **GT**: conditioner
118 |
119 | **Required capabilities**: OCR, spatial awareness
120 |
121 | ---
122 |
123 | 
124 |
125 | **Q**: Which room is bigger, the double garage or the living room?
126 |
127 | **GT**: double garage
128 |
129 | **Required capabilities**: OCR, spatial awareness, math
130 |
131 | ---
132 |
133 | 
134 |
135 | **Q**: On the right desk, what is to the left of the laptop?
136 |
137 | **GT**: table lamp \ desk lamp
138 |
139 | **Required capabilities**: Recognition, spatial awareness
140 |
141 | ---
142 |
143 | 
144 |
145 | **Q**: What are all the scene text in the image?
146 |
147 | **GT**: 5:30PM\88%\Mario Kart 8 Deluxe\MARIO KART 8 DELUXE\SUPER MARIO ODYSSEY\THE LEGEND OF ZELDA\BREATH OF WILD\Options\Start
148 |
149 | **Required capabilities**: OCR
150 |
151 | ---
152 |
153 | 
154 |
155 | **Q**: How many gallons of supreme gasoline can I get with $50?
156 |
157 | **GT**: 13.6 \ 13.7
158 |
159 | **Required capabilities**: OCR, math
160 |
161 | ---
162 |
163 | 
164 |
165 | **Q**: In which country was this photo taken?
166 |
167 | **GT**: Australia
168 |
169 | **Required capabilities**: Recognition, knowledge
170 |
171 | ---
172 |
173 | 
174 |
175 | **Q**: Can you explain this meme?
176 |
177 | **GT**: This meme is a humorous take on procrastination and the tendency to delay tasks until a specific time. The person in the meme plans to do something at 8 o'clock, but when they miss that deadline by a few minutes, they decide to wait until 9 o'clock instead. The image of Kermit the Frog lying in bed represents the person's laziness and lack of motivation to complete the task.
178 |
179 | **Required capabilities**: Recognition, OCR, knowledge, language generation
180 |
181 | ---
182 |
183 | 
184 |
185 | **Q**: The graph below shows the long-term international migration, UK, 1999-2008.
186 |
187 | Summarize the information by selecting and reporting the main features, and make comparisons where relevant.
188 |
189 | You should write at least 150 words.
190 |
191 | **GT**: The chart gives information about UK immigration, emigration and net migration between 1999 and 2008.
192 |
193 | Both immigration and emigration rates rose over the period shown, but the figures for immigration were significantly higher. Net migration peaked in 2004 and 2007.
194 |
195 | In 1999, over 450,000 people came to live in the UK, while the number of people who emigrated stood at just under 300,000. The figure for net migration was around 160,000, and it remained at a similar level until 2003. From 1999 to 2004, the immigration rate rose by nearly 150,000 people, but there was a much smaller rise in emigration. Net migration peaked at almost 250,000 people in 2004.
196 |
197 | After 2004, the rate of immigration remained high, but the number of people emigrating fluctuated. Emigration fell suddenly in 2007, before peaking at about 420,000 people in 2008. As a result, the net migration figure rose to around 240,000 in 2007, but fell back to around 160,000 in 2008.
198 |
199 | **Required capabilities**: Recognition, OCR, language generation, spatial awareness
200 |
201 | ---
202 |
203 | 
204 |
205 | **Q**: Which car is on the parking spot 33?
206 |
207 | **GT**: no \ empty
208 |
209 | **Required capabilities**: Recognition, OCR, spatial awareness
210 |
211 | ---
212 |
213 |
214 | 
215 |
216 | **Q**: Is this apple organic?
217 |
218 | **GT**: yes
219 |
220 | **Required capabilities**: Recognition, OCR
221 |
222 | ---
223 |
224 | 
225 |
226 | **Q**: Which are producers in this food web?
227 |
228 | **GT**: Phytoplankton \ Seaweed
229 |
230 | **Required capabilities**: OCR, knowledge, spatial awareness
231 |
232 | ---
233 |
234 | 
235 |
236 | **Q**: Is the person bigger than the car?
237 |
238 | **GT**: no
239 |
240 | **Required capabilities**: Recognition, knowledge, spatial awareness
241 |
242 | ---
243 |
244 | 
245 |
246 | **Q**: The table below gives information about the underground railway systems in six cities.
247 |
248 | Summarise the information by selecting and reporting the main features, and make comparisons where relevant.
249 |
250 | You should write at least 150 words.
251 |
252 | **GT**: The table shows data about the underground rail networks in six major cities.
253 |
254 | The table compares the six networks in terms of their age, size and the number of people who use them each year. It is clear that the three oldest underground systems are larger and serve significantly more passengers than the newer systems.
255 |
256 | The London underground is the oldest system, having opened in 1863. It is also the largest system, with 394 kilometres of route. The second largest system, in Paris, is only about half the size of the London underground, with 199 kilometres of route. However, it serves more people per year. While only third in terms of size, the Tokyo system is easily the most used, with 1927 million passengers per year.
257 |
258 | Of the three newer networks, the Washington DC underground is the most extensive, with 126 kilometres of route, compared to only 11 kilometres and 28 kilometres for the Kyoto and Los Angeles systems. The Los Angeles network is the newest, having opened in 2001, while the Kyoto network is the smallest and serves only 45 million passengers per year.
259 |
260 | **Required capabilities**: OCR, language generation, spatial awareness
261 |
262 | ---
263 |
264 | 
265 |
266 | **Q**: What will the girl on the right write on the board?
267 |
268 | **GT**: 14
269 |
270 | **Required capabilities**: Recognition, OCR, spatial awareness, math
271 |
272 |
273 | More samples are shown [here](README_more_samples.md).
--------------------------------------------------------------------------------
/README_more_samples.md:
--------------------------------------------------------------------------------
1 | ## More examples in MM-Vet
2 |
3 | 
4 |
5 | **Q**: What is x in the equation?
6 |
7 | **GT**: -1 \ -5
8 |
9 | **Required capabilities**: OCR, math
10 |
11 | ---
12 |
13 | 
14 |
15 | **Q**: What is d in the last equation?
16 |
17 | **GT**: 1.25 \ 5/4
18 |
19 | **Required capabilities**: OCR, math
20 |
21 | ---
22 |
23 | 
24 |
25 | **Q**: What is the total price for a bottle of Merlot and a bottle of Cabernet shown in the image?
26 |
27 | **GT**: 249.98
28 |
29 | **Required capabilities**: OCR, spatial awareness, math
30 |
31 | ---
32 |
33 | 
34 |
35 | **Q**: I am getting one funghi pizza and one prosciutto pizza. How much should I pay in total?
36 |
37 | **GT**: 14.8
38 |
39 | **Required capabilities**: OCR, math
40 |
41 | ---
42 |
43 | 
44 |
45 | **Q**: What is the price for tomatoes?
46 |
47 | **GT**: eight \ 8.0
48 |
49 | **Required capabilities**: recognition, OCR, spatial awareness
50 |
51 | ---
52 |
53 | 
54 |
55 | **Q**: What earth's interior structure does number 2 indicate?
56 |
57 | **GT**: mantle
58 |
59 | **Required capabilities**: OCR, knowledge, spatial awareness
60 |
61 | ---
62 |
63 | 
64 |
65 | **Q**: Should I add sugar when mixing egg and milk?
66 |
67 | **GT**: no
68 |
69 | **Required capabilities**: recognition, OCR, spatial awareness
70 |
71 | ---
72 |
73 | 
74 |
75 | **Q**: What should we add in the third step?
76 |
77 | **GT**: milk
78 |
79 | **Required capabilities**: recognition, OCR, spatial awareness
80 |
81 | ---
82 |
83 | 
84 |
85 | **Q**: What is the difference in metric tonnes between the amount of plastic produced and the amount discarded?
86 |
87 | **GT**: 105
88 |
89 | **Required capabilities**: OCR, spatial awareness, math
90 |
91 | ---
92 |
93 | 
94 |
95 | **Q**: What is the estimated average standing charge for the year 2023?
96 |
97 | **GT**: 271
98 |
99 | **Required capabilities**: OCR, spatial awareness, math
100 |
101 | ---
102 |
103 | 
104 |
105 | **Q**: What are the appliances in the kitchen in this floorplan?
106 |
107 | **GT**: oven \ dishwasher
108 |
109 | **Required capabilities**: OCR, spatial awareness
110 |
111 |
112 | ---
113 |
114 | 
115 |
116 | **Q**: What is the solution if the lamp is plugged in and the bulb has burned out?
117 |
118 | **GT**: replace blub
119 |
120 | **Required capabilities**: OCR, spatial awareness
121 |
122 | ---
123 |
124 | 
125 |
126 | **Q**: What is the average wait time to climb out of bed?
127 |
128 | **GT**: 15 min
129 |
130 | **Required capabilities**: OCR, spatial awareness, math
131 |
132 | ---
133 |
134 | 
135 |
136 | **Q**: What is Japan's gdp in 2012?
137 |
138 | **GT**: 6,233.15 billion U.S. dollars
139 |
140 | **Required capabilities**: OCR, spatial awareness
141 |
142 | ---
143 |
144 | 
145 |
146 | **Q**: In which years did rowing and athletics have the same number of gold medals?
147 |
148 | **GT**: 2000 \ 2012
149 |
150 | **Required capabilities**: OCR, spatial awareness
151 |
152 | ---
153 |
154 | 
155 |
156 | **Q**: Which department is the person who has the highest salary from?
157 |
158 | **GT**: Administration
159 |
160 | **Required capabilities**: OCR, spatial awareness
161 |
162 | ---
163 |
164 | 
165 |
166 | **Q**: What percentage does salary contribute to total income?
167 |
168 | **GT**: 56.2
169 |
170 | **Required capabilities**: OCR, spatial awareness, math
171 |
172 | ---
173 |
174 | 
175 |
176 | **Q**: Who is the person on the left?
177 |
178 | **GT**: Keira Knightley
179 |
180 | **Required capabilities**: recognition, spatial awareness
181 |
182 | ---
183 |
184 | 
185 |
186 | **Q**: Which one is unsweetened?
187 |
188 | **GT**: oat
189 |
190 | **Required capabilities**: OCR, spatial awareness
191 |
192 | ---
193 |
194 | 
195 |
196 | **Q**: What are the calories for the entire bag?
197 |
198 | **GT**: 275
199 |
200 | **Required capabilities**: OCR, math
201 |
202 | ---
203 |
204 | 
205 |
206 | **Q**: What is the name of this dish?
207 |
208 | **GT**: pad thai
209 |
210 | **Required capabilities**: recognition
211 |
212 | ---
213 |
214 | 
215 |
216 | **Q**: What is the make of the car on the left?
217 |
218 | **GT**: volkswagen
219 |
220 | **Required capabilities**: recognition, OCR, spatial awareness
221 |
222 | ---
223 |
224 | 
225 |
226 | **Q**: What is the name of this landmark?
227 |
228 | **GT**: baochu pagoda
229 |
230 | **Required capabilities**: recognition
231 |
232 |
233 | ---
234 |
235 | 
236 |
237 | **Q**: What is the face value of this banknote?
238 |
239 | **GT**: five
240 |
241 | **Required capabilities**: recognition, knowledge
242 |
243 |
244 | ---
245 |
246 | 
247 |
248 | **Q**: What type of currency does this banknote represent?
249 |
250 | **GT**: Swede \ Kronor
251 |
252 | **Required capabilities**: recognition
253 |
254 | ---
255 |
256 | 
257 |
258 | **Q**: Which chest xray disease can be found? (Options include: Atelectasis; Cardiomegaly; Effusion; Infiltration; Mass; Nodule; Pneumonia; Pneumothorax; Consolidation; Edema; Emphysema; Fibrosis; Pleural Thickening; Hernia.)
259 |
260 | **GT**: Cardiomegaly
261 |
262 | **Required capabilities**: recognition, knowledge
263 |
264 | ---
265 |
266 | 
267 |
268 | **Q**: How many books are there?
269 |
270 | **GT**: 63
271 |
272 | **Required capabilities**: recognition
273 |
274 | ---
275 |
276 | 
277 |
278 | **Q**: What is all the scene text in the image?
279 |
280 | **GT**: MR. WILLIAM SHAKESPEARES \ COMEDIES, HISTORIES, & TRAGEDIES.\ publifhed according to the True Originall Copies.
281 |
282 | **Required capabilities**: OCR
283 |
284 | ---
285 |
286 | 
287 |
288 | **Q**: What is the brand of this device?
289 |
290 | **GT**: MOTOROLA
291 |
292 | **Required capabilities**: OCR
293 |
294 | ---
295 |
296 | 
297 |
298 | **Q**: What are all the French scene text in the image?
299 |
300 | **GT**: HISTOIRE DE FRANCE \ De La préhistoire à ans jours \ Queelle Histoire
301 |
302 | **Required capabilities**: OCR
303 |
304 | ---
305 |
306 | 
307 |
308 | **Q**: Which continent is highlighted?
309 |
310 | **GT**: Asia
311 |
312 | **Required capabilities**: recognition, knowledge
313 |
314 | ---
315 |
316 | 
317 |
318 | **Q**: Where is this photo taken?
319 |
320 | **GT**: qatar
321 |
322 | **Required capabilities**: recognition, knowledge
323 |
324 | ---
325 |
326 | 
327 |
328 | **Q**: Are the trees taller than the giraffes?
329 |
330 | **GT**: no
331 |
332 | **Required capabilities**: recognition
333 |
334 | ---
335 |
336 | 
337 |
338 | **Q**: Is this airplane taking off or landing?
339 |
340 | **GT**: taking off
341 |
342 | **Required capabilities**: recognition
343 |
344 | ---
345 |
346 | 
347 |
348 | **Q**: Does the giraffe appear dangerous?
349 |
350 | **GT**: no
351 |
352 | **Required capabilities**: recognition
353 |
354 | ---
355 |
356 | 
357 |
358 | **Q**: Is there any reflection of zebra in water?
359 |
360 | **GT**: yes
361 |
362 | **Required capabilities**: recognition
363 |
364 | ---
365 |
366 | 
367 |
368 | **Q**: Are all of the cats the same color?
369 |
370 | **GT**: no
371 |
372 | **Required capabilities**: recognition
373 |
374 |
375 | ---
376 |
377 | 
378 |
379 | **Q**: Are there napkins under the utensil to the left of the rice?
380 |
381 | **GT**: yes
382 |
383 | **Required capabilities**: recognition, spatial awareness
384 |
385 | ---
386 |
387 | 
388 |
389 | **Q**: Is the surfer to the left or to the right of the surfboard?
390 |
391 | **GT**: left
392 |
393 | **Required capabilities**: recognition, spatial awareness
394 |
395 | ---
396 |
397 | 
398 |
399 | **Q**: Why are the persons wearing black clothes? Answer it and give the rationale.
400 |
401 | **GT**: They are going to a funeral. I think so because it is traditional for people to wear black to a funeral.
402 |
403 | **Required capabilities**: recognition, knowledge, language generation
404 |
405 | ---
406 |
407 | 
408 |
409 | **Q**: Is the woman in red going to be mad at the man, for looking at the woman in dress? Answer it and give the rationale.
410 |
411 | **GT**: Yes, the woman in red and the man appear to be a couple and the woman in red would not appreciate the man checking out other women. I think so because in a monogamous relationship, partners are expected to be faithful to one another.
412 |
413 | **Required capabilities**: recognition, knowledge, language generation
414 |
415 |
416 | ---
417 |
418 | 
419 |
420 | **Q**: Can you explain this meme?
421 |
422 | **GT**: This meme is a commentary on the current situation of online learning due to the COVID-19 pandemic. It shows how different parties are affected by it.
The first two images show the Zoom company and the teachers throwing money in the air, which implies that they are profiting from the situation. Zoom, in particular, has seen a surge in usage and revenue due to the pandemic.
The third image shows the WiFi company also throwing money in the air, which implies that they are also profiting from the situation as more people are using the internet for online learning.
The last image shows a student with a sad clown face, which implies that students are not benefiting from the situation and are struggling with online learning. The \"Put on a happy face\" sign in the background further emphasizes the idea that students are putting on a brave face despite their struggles.
423 |
424 | **Required capabilities**: recognition, OCR, knowledge, language generation
425 |
426 | ---
427 |
428 | 
429 |
430 | **Q**: What is funny about this image?
431 |
432 | **GT**: It is a cartoon of a rhinoceros painting a picture and each picture has its rhino horn because the rhino horn grows in front of its eyes. The caption \"You see the world as you are!\" is a playful commentary on how our perspective shapes our perception of the world.
433 |
434 | **Required capabilities**: recognition, OCR, knowledge, language generation
435 |
436 |
437 | ---
438 |
439 | 
440 |
441 | **Q**: How to cook this dish?
442 |
443 | **GT**: The dish is called \"Hungarian Mushroom Soup\". This Hungarian mushroom soup has lots of flavor and is fairly quick to make. It's primarily a mushroom soup but derives a lot of its flavor from other ingredients. My family loves soup and this is one of their favorites.
Prep Time: 15 mins
Cook Time: 35 mins
Total Time: 50 mins
Servings: 6
Ingredients
4 tablespoons unsalted butter
2 cups chopped onions
1 pound fresh mushrooms, sliced
2 cups chicken broth
1 tablespoon soy sauce
1 tablespoon paprika
2 teaspoons dried dill weed
1 cup milk
3 tablespoons all-purpose flour
½ cup sour cream
¼ cup chopped fresh parsley
2 teaspoons lemon juice
1 teaspoon salt
ground black pepper to taste
Directions
Step1
Melt butter in a large pot over medium heat. Add onions; cook and stir until softened, about 5 minutes. Add mushrooms and sauté for 5 more minutes. Stir in broth, soy sauce, paprika, and dill; reduce heat to low, cover, and simmer for 15 minutes.
Step2
Whisk milk and flour together in a separate bowl; stir into soup until blended. Cover and simmer for 15 more minutes, stirring occasionally.
Step3
Add sour cream, parsley, lemon juice, salt, and ground black pepper; stir over low heat until warmed through, about 3 to 5 minutes. Serve immediately.
444 |
445 | **Required capabilities**: recognition, knowledge, language generation
446 |
447 | ---
448 |
449 | 
450 |
451 | **Q**: How to make this?
452 |
453 | **GT**: This is called \"Sparkling Cantaloupe Agua Fresca\". This refreshing agua fresca with cantaloupe and lime juice topped with club soda is perfect on a hot day.
Prep Time: 15 mins
Total Time: 15 mins
Servings: 8
Ingredients
2 pounds fresh cantaloupe cubes (6 cups)
2 cups water
6 tablespoons freshly squeezed lime juice (from 2 limes)
3 tablespoons honey
1/4 teaspoon salt
2 cups club soda, chilled
cantaloupe melon balls or cubes, for garnish
lime zest twists, for garnish
Directions
Step 1
Working in batches if needed, blend cantaloupe, water, lime juice, honey, and salt in a blender until smooth. Press through a sieve; discard solids. Add strained juice to a pitcher.
Step 2
Chill until ready to serve. Add club soda just before serving. Serve over ice, and garnish glasses with melon balls and lime zest twists.
454 |
455 | **Required capabilities**: recognition, knowledge, language generation
456 |
457 | ---
458 |
459 | 
460 |
461 | **Q**: Can you give a short introduction to this movie?
462 |
463 | **GT**: The Shawshank Redemption is a 1994 American drama film written and directed by Frank Darabont, based on the 1982 Stephen King novella Rita Hayworth and Shawshank Redemption. The film tells the story of banker Andy Dufresne (Tim Robbins), who is sentenced to life in Shawshank State Penitentiary for the murders of his wife and her lover, despite his claims of innocence. Over the following two decades, he befriends a fellow prisoner, contraband smuggler Ellis \"Red\" Redding (Morgan Freeman), and becomes instrumental in a money laundering operation led by the prison warden Samuel Norton (Bob Gunton). William Sadler, Clancy Brown, Gil Bellows, and James Whitmore appear in supporting roles.
464 |
465 | **Required capabilities**: recognition, knowledge, language generation
466 |
467 |
468 | 
469 |
470 | **Q**: Can you give a short introduction to this painting?
471 |
472 | **GT**: Arrangement in Grey and Black No. 1, best known under its colloquial name Whistler's Mother or Portrait of Artist's Mother, is a painting in oils on canvas created by the American-born painter James Abbott McNeill Whistler in 1871. The subject of the painting is Whistler's mother, Anna McNeill Whistler. The painting is 56.81 by 63.94 inches (1,443 mm × 1,624 mm), displayed in a frame of Whistler's own design. It is held by the Musée d'Orsay in Paris, having been bought by the French state in 1891. It is one of the most famous works by an American artist outside the United States. It has been variously described as an American icon and a Victorian Mona Lisa.
473 |
474 | **Required capabilities**: recognition, knowledge, language generation
475 |
476 | ---
477 |
478 | 
479 |
480 | **Q**: What is this disease (give short introduction) and how to treat it?
481 |
482 | **GT**: It is club root. Affecting most brassica crops (cabbage, broccoli, cauliflower, etc.), club root is a serious plant disease in North American home gardens. It is caused by the soil-borne fungus Plasmodiophora brassicae which infects susceptible plants through root hairs. Diseased roots become swollen, misshapen and deformed (clubbed) often cracking and rotting. As a result, plants have difficulty absorbing water and nutrients properly.
Plants often grow poorly and wilt during the heat of the day; plants often revive during cool nights. Outer leaves may turn yellow, purple or brown. Club root will reduce yields and can cause total crop failure.
Fungal spores can be spread by wind, water and garden tools. Disease development can occur over a wide range of conditions, but is favored by excessive moisture, low soil pH and soil temperatures between 64 and 77˚F. Spores can survive in the soil for as many as 10 years.
Treatment
1. Fungicides will NOT treat this soil-dwelling micro-organism.
2. Choose resistant cultivars when possible.
3. Try to prevent the occurrence of this disease by keeping a clean garden and rotating crops.
4. Keep in mind that the disease spores can persist in the soil for up to 20 years. If club root is present you may want to solarize the soil.*
5. Control susceptible weeds \u2014 mustard, radish, shepherd\u2019s purse \u2014 that may be infected to reduce potential buildup of the disease.
6. Carefully remove infected plants and sterilize garden tools (one part bleach to 4 parts water) after use.
7. Raise your soil\u2019s pH to a more alkaline 7.2 by mixing oyster shell or dolomite lime into your garden in the fall. Simple and affordable soil test kits are available to check pH often.
483 |
484 | **Required capabilities**: recognition, knowledge, language generation
485 |
486 | ---
487 |
488 | 
489 |
490 | **Q**: What is this disease (give short introduction) and how to treat it?
491 |
492 | **GT**: Joshua Abraham Norton (February 4, 1818 – January 8, 1880), known as Emperor Norton, was a resident of San Francisco, California, who in 1859 proclaimed himself \"Norton I., Emperor of the United States\". In 1863, after Napoleon III invaded Mexico, he took the secondary title of \"Protector of Mexico\".
493 |
494 | **Required capabilities**: recognition, knowledge, language generation
495 |
496 | ---
497 |
498 | 
499 |
500 | **Q**: The graph and table below give information about water use worldwide and water consumption in two different countries.
Summarise the information by selecting and reporting the main features, and make comparisons where relevant.
You should write at least 150 words.
501 |
502 | **GT**: The charts compare the amount of water used for agriculture, industry and homes around the world, and water use in Brazil and the Democratic Republic of Congo.
It is clear that global water needs rose significantly between 1900 and 2000, and that agriculture accounted for the largest proportion of water used. We can also see that water consumption was considerably higher in Brazil than in the Congo.
In 1900, around 500km³ of water was used by the agriculture sector worldwide. The figures for industrial and domestic water consumption stood at around one fifth of that amount. By 2000, global water use for agriculture had increased to around 3000km³, industrial water use had risen to just under half that amount, and domestic consumption had reached approximately 500km³.
In the year 2000, the populations of Brazil and the Congo were 176 million and 5.2 million respectively. Water consumption per person in Brazil, at 359m³, was much higher than that in the Congo, at only 8m³, and this could be explained by the fact that Brazil had 265 times more irrigated land.
503 |
504 | **Required capabilities**: recognition, OCR, language generation, spatial awareness
505 |
506 | ---
507 |
508 | 
509 |
510 | **Q**: Chorleywood is a village near London whose population has increased steadily since the middle of the nineteenth century. The map below shows the development of the village.
Write a report for a university lecturer describing the development of the village.
You should write at least 150 words.
511 |
512 | **GT**: The map shows the growth of a village called Chorleywood between 1868 and 1994.
It is clear that the village grew as the transport infrastructure was improved. Four periods of development are shown on the map, and each of the populated areas is near to the main roads, the railway or the motorway.
From 1868 to 1883, Chorleywood covered a small area next to one of the main roads. Chorleywood Park and Golf Course is now located next to this original village area. The village grew along the main road to the south between 1883 and 1922, and in 1909 a railway line was built crossing this area from west to east. Chorleywood station is in this part of the village.
The expansion of Chorleywood continued to the east and west alongside the railway line until 1970. At that time, a motorway was built to the east of the village, and from 1970 to 1994, further development of the village took place around motorway intersections with the railway and one of the main roads.
513 |
514 | **Required capabilities**: recognition, OCR, language generation, spatial awareness
--------------------------------------------------------------------------------
/inference/bard.py:
--------------------------------------------------------------------------------
1 | """
2 | Install bardapi by
3 | ```bash
4 | pip install bardapi
5 | ```
6 |
7 | Run this script by
8 | ```bash
9 | while true; do python bard.py; sleep 60; done
10 | ```
11 | Currently we have to use a loop in bash instead of Python because Bard-API seems to have a bug.
12 |
13 | Remember to change your image folder and metadata path in this script.
14 | """
15 |
16 | import pandas as pd
17 | import os
18 | import json
19 | import time
20 | from bardapi import Bard
21 |
22 |
23 | BARD_TOKEN = "YOUR_TOKEN_HERE" # https://github.com/dsdanielpark/Bard-API#authentication
24 |
25 | model_name = "bard"
26 | bard_error = "Temporarily unavailable due to traffic or an error in cookie values."
27 |
28 | # change the path to your own path
29 | results_path = f'../results/{model_name}.json' # path to save the results
30 | image_folder = f"/path/to/mm-vet/images"
31 | meta_data = "/path/to/mm-vet/mm-vet.json"
32 |
33 |
34 | with open(meta_data, 'r') as f:
35 | data = json.load(f)
36 |
37 |
38 | if os.path.exists(results_path):
39 | with open(results_path, 'r') as f:
40 | results = json.load(f)
41 | else:
42 | results = {}
43 |
44 | data_num = len(data)
45 |
46 | # time.sleep(60)
47 | for i in range(len(data)):
48 | id = f"v1_{i}"
49 | if id in results and not (bard_error in results[id]):
50 | continue
51 | # time.sleep(60)
52 | imagename = data[id]['imagename']
53 | img_path = os.path.join(image_folder, imagename)
54 | prompt = data[id]['question']
55 | prompt = prompt.strip()
56 | print(f"\nPrompt: {prompt}")
57 | # load sample image
58 | bard = Bard(token=BARD_TOKEN)
59 | image = open(img_path, 'rb').read() # (jpeg, png, webp) are supported.
60 | bard_answer = bard.ask_about_image(prompt, image)
61 | response = bard_answer['content']
62 |     if bard_error in response:
63 |         time.sleep(60)  # Bard hit a traffic/cookie error; wait before exiting
64 |         break  # exit so the outer bash `while true` loop restarts the script
65 |
66 |     print(f"Response: {response}")
67 |     results[id] = response
68 |     with open(results_path, 'w') as f:
69 |         json.dump(results, f, indent=4)
70 |     break  # process one sample per run; the bash loop launches the next run
71 |
--------------------------------------------------------------------------------
/inference/bard.sh:
--------------------------------------------------------------------------------
1 | while true; do python bard.py; sleep 60; done
2 |
--------------------------------------------------------------------------------
/inference/claude.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 | python claude.py --mmvet_path /path/to/mm-vet --anthropic_api_key YOUR_API_KEY
4 | """
5 | import os
6 | import argparse
7 | import anthropic
8 | from utils import evaluate_on_mmvet, encode_image
9 |
10 |
11 | class Claude:
12 | def __init__(self, api_key,
13 | model="claude-3-opus-20240229", temperature=0.0,
14 | max_tokens=512, system=None):
15 | self.model = model
16 | self.client = anthropic.Anthropic(
17 | api_key=api_key,
18 | )
19 | self.system = system
20 | self.temperature = temperature
21 | self.max_tokens = max_tokens
22 |
23 | def get_response(self, image_path, prompt="What's in this image?"):
24 | base64_image = encode_image(image_path)
25 | image_format = "png" if image_path.endswith('.png') else "jpeg"
26 |
27 | messages = []
28 | content = [
29 | {
30 | "type": "text",
31 | "text": prompt,
32 | },
33 | {
34 | "type": "image",
35 | "source": {
36 | "type": "base64",
37 | "media_type": f"image/{image_format}",
38 | "data": base64_image,
39 | }
40 | }
41 |
42 | ]
43 |
44 | messages.append({
45 | "role": "user",
46 | "content": content,
47 | })
48 |
49 | payload = {
50 | "model": self.model,
51 | "messages": messages,
52 | "max_tokens": self.max_tokens,
53 | "temperature": self.temperature,
54 | }
55 |
56 | if self.system:
57 | payload["system"] = self.system
58 |
59 | response = self.client.messages.create(**payload)
60 | response_text = response.content[0].text
61 | return response_text.strip()
62 |
63 |
64 | def arg_parser():
65 | parser = argparse.ArgumentParser()
66 | parser.add_argument(
67 | "--mmvet_path",
68 | type=str,
69 | default="/path/to/mm-vet",
70 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here",
71 | )
72 | parser.add_argument(
73 | "--result_path",
74 | type=str,
75 | default="results",
76 | )
77 | parser.add_argument(
78 | "--anthropic_api_key", type=str, default=None,
79 | help="refer to https://docs.anthropic.com/claude/reference/getting-started-with-the-api"
80 | )
81 | parser.add_argument(
82 | "--model_name",
83 | type=str,
84 | default="claude-3-opus-20240229",
85 | help="Claude model name",
86 | )
87 | args = parser.parse_args()
88 | return args
89 |
90 |
91 | if __name__ == "__main__":
92 | args = arg_parser()
93 |
94 | # prepare the model
95 | if args.anthropic_api_key:
96 | ANTHROPIC_API_KEY = args.anthropic_api_key
97 | else:
98 | ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
99 |
100 | if ANTHROPIC_API_KEY is None:
101 | raise ValueError("Please set the ANTHROPIC_API_KEY environment variable or pass it as an argument")
102 |
103 | model = Claude(ANTHROPIC_API_KEY, model=args.model_name)
104 |
105 |     # evaluate on mm-vet
106 | evaluate_on_mmvet(args, model)
107 |
--------------------------------------------------------------------------------
/inference/gemini_vision.py:
--------------------------------------------------------------------------------
1 | """
2 | Please refer to https://ai.google.dev/tutorials/python_quickstart to get the API key
3 |
4 | Install with `pip install -q -U google-generativeai`,
5 | Then `python gemini_vision.py --mmvet_path /path/to/mm-vet --google_api_key YOUR_API_KEY`
6 | """
7 |
8 | import os
9 | import time
10 | from pathlib import Path
11 | import google.generativeai as genai
12 | import argparse
13 | from utils import evaluate_on_mmvet
14 |
15 | class Gemini:
16 | def __init__(self, model="gemini-pro-vision"):
17 | self.model = genai.GenerativeModel(model)
18 |
19 | def get_response(self, image_path, prompt) -> str:
20 | # Query the model
21 | text = ""
22 | while len(text) < 1:
23 | try:
24 | image_path = Path(image_path)
25 | image = {
26 | 'mime_type': f'image/{image_path.suffix[1:].replace("jpg", "jpeg")}',
27 | 'data': image_path.read_bytes()
28 | }
29 | response = self.model.generate_content(
30 | [
31 | # Add an example image
32 | image,
33 | # Add an example query
34 | prompt,
35 | ]
36 | )
37 | try:
38 | text = response._result.candidates[0].content.parts[0].text
39 |                 except Exception:
40 |                     text = " "  # no usable candidate (e.g. blocked response); placeholder so the retry loop exits
41 | except Exception as error:
42 | print(error)
43 | sleep_time = 30
44 | print(f'Sleeping for {sleep_time} seconds')
45 | time.sleep(sleep_time)
46 | return text.strip()
47 |
48 |
49 | def arg_parser():
50 | parser = argparse.ArgumentParser()
51 | parser.add_argument(
52 | "--mmvet_path",
53 | type=str,
54 | default="/path/to/mm-vet",
55 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here",
56 | )
57 | parser.add_argument(
58 | "--result_path",
59 | type=str,
60 | default="results",
61 | )
62 | parser.add_argument(
63 | "--google_api_key", type=str, default=None,
64 | help="refer to https://ai.google.dev/tutorials/python_quickstart"
65 | )
66 | parser.add_argument(
67 | "--model_name",
68 | type=str,
69 | default="gemini-pro-vision",
70 | help="Gemini model name",
71 | )
72 | args = parser.parse_args()
73 | return args
74 |
75 |
76 | if __name__ == "__main__":
77 | args = arg_parser()
78 |
79 | # prepare the model
80 | if args.google_api_key:
81 | GOOGLE_API_KEY = args.google_api_key
82 | else:
83 | GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
84 |
85 | if GOOGLE_API_KEY is None:
86 | raise ValueError("Please set the GOOGLE_API_KEY environment variable or pass it as an argument")
87 |
88 | genai.configure(api_key=GOOGLE_API_KEY)
89 | model = Gemini(model=args.model_name)
90 |
91 | # evaluate on mm-vet
92 | evaluate_on_mmvet(args, model)
--------------------------------------------------------------------------------
/inference/gpt4v.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 | python gpt4v.py --mmvet_path /path/to/mm-vet --openai_api_key YOUR_API_KEY
4 | """
5 |
6 | import time
7 | import os
8 | import requests
9 | import argparse
10 | from utils import evaluate_on_mmvet, encode_image
11 |
12 |
13 | class GPT4V:
14 | def __init__(self, api_key, model="gpt-4-vision-preview", image_detail="auto",
15 | system_text="You are a helpful assistant. Generate a short and concise response to the following image text pair."):
16 | self.api_key = api_key
17 | self.model = model
18 | self.image_detail = image_detail
19 | self.system_text = system_text
20 | self.headers = {
21 | "Content-Type": "application/json",
22 | "Authorization": f"Bearer {self.api_key}"
23 | }
24 | self.url = "https://api.openai.com/v1/chat/completions"
25 |
26 | def get_response(self, image_path, prompt="What's in this image?"):
27 | base64_image = encode_image(image_path)
28 | image_format = "data:image/png;base64" if 'png' in image_path else "data:image/jpeg;base64"
29 | messages = []
30 |         if self.system_text:  # only add a system message when it is non-empty
31 | messages.append({
32 | "role": "system",
33 | "content": [
34 | {
35 | "type": "text",
36 | "text": self.system_text,
37 | },
38 | ]
39 | })
40 | messages.append({
41 | "role": "user",
42 | "content": [
43 | {
44 | "type": "text",
45 | "text": prompt
46 | },
47 | {
48 | "type": "image_url",
49 | "image_url": {
50 | "url": f"{image_format},{base64_image}",
51 | "detail": self.image_detail,
52 | }
53 | }
54 | ]
55 | })
56 |
57 | payload = {
58 | "model": self.model,
59 | "messages": messages,
60 | "max_tokens": 300,
61 | }
62 |
63 | response_text, retry, response_json, regular_time = '', 0, None, 30
64 | while len(response_text) < 1:
65 | retry += 1
66 | time.sleep(1)
67 | try:
68 | response = requests.post(self.url, headers=self.headers, json=payload)
69 | response_json = response.json()
70 | # print(response_json)
71 | except Exception as e:
72 | print(e)
73 | time.sleep(regular_time)
74 | continue
75 | if response.status_code != 200:
76 | print(response.headers,response.content)
77 | print(image_path)
78 | print(f"The response status code for is {response.status_code} (Not OK)")
79 | time.sleep(regular_time)
80 | continue
81 | if 'choices' not in response_json:
82 | time.sleep(regular_time)
83 | continue
84 | response_text = response_json["choices"][0]["message"]["content"]
85 | return response_text.strip()
86 |
87 |
88 | def arg_parser():
89 | parser = argparse.ArgumentParser()
90 | parser.add_argument(
91 | "--mmvet_path",
92 | type=str,
93 | default="/path/to/mm-vet",
94 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here",
95 | )
96 | parser.add_argument(
97 | "--result_path",
98 | type=str,
99 | default="results",
100 | )
101 | parser.add_argument(
102 | "--openai_api_key", type=str, default=None,
103 | help="refer to https://platform.openai.com/docs/quickstart?context=python"
104 | )
105 | parser.add_argument(
106 | "--model_name",
107 | type=str,
108 | default="gpt-4-vision-preview",
109 | help="GPT model name",
110 | )
111 | parser.add_argument(
112 | "--image_detail",
113 | type=str,
114 | default="auto",
115 | help="Refer to https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding",
116 | )
117 | args = parser.parse_args()
118 | return args
119 |
120 |
121 | if __name__ == "__main__":
122 | args = arg_parser()
123 |
124 | # prepare the model
125 | if args.openai_api_key:
126 | OPENAI_API_KEY = args.openai_api_key
127 | else:
128 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
129 |
130 | if OPENAI_API_KEY is None:
131 | raise ValueError("Please set the OPENAI_API_KEY environment variable or pass it as an argument")
132 |
133 | model = GPT4V(OPENAI_API_KEY, model=args.model_name, image_detail=args.image_detail)
134 | args.model_name = f"{args.model_name}_detail-{args.image_detail}"
135 |
136 | # evaluate on mm-vet
137 | evaluate_on_mmvet(args, model)
--------------------------------------------------------------------------------
/inference/qwen.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 | python qwen.py --mmvet_path /path/to/mm-vet --dashscope_api_key YOUR_API_KEY
4 | """
5 | import os
6 | import argparse
7 | from utils import evaluate_on_mmvet
8 | from http import HTTPStatus
9 | import dashscope
10 |
11 | class Qwen:
12 | def __init__(self, model='qwen-vl-max'):
13 | self.model = model
14 |
15 | def get_response(self, image_path, prompt="What's in this image?"):
16 | messages = []
17 | content = [
18 | {
19 | "text": prompt,
20 | },
21 | {
22 | "image": f"file://{image_path}"
23 | }
24 | ]
25 |
26 | messages.append({
27 | "role": "user",
28 | "content": content,
29 | })
30 |
31 | payload = {
32 | "model": self.model,
33 | "messages": messages,
34 | }
35 |
36 | response = dashscope.MultiModalConversation.call(**payload)
37 | if response.status_code == HTTPStatus.OK:
38 | rps = response['output']['choices'][0]['message']['content']
39 | for rp in rps:
40 | if 'text' in rp:
41 | response_text = rp['text']
42 | return response_text.strip()
43 | else:
44 | print(response.code) # The error code.
45 | print(response.message) # The error message.
46 | return ""
47 |
48 |
49 | def arg_parser():
50 | parser = argparse.ArgumentParser()
51 | parser.add_argument(
52 | "--mmvet_path",
53 | type=str,
54 | default="/path/to/mm-vet",
55 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here",
56 | )
57 | parser.add_argument(
58 | "--result_path",
59 | type=str,
60 | default="results",
61 | )
62 | parser.add_argument(
63 | "--dashscope_api_key", type=str, default=None,
64 | help="refer to https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start"
65 | )
66 | parser.add_argument(
67 | "--model_name",
68 | type=str,
69 | default="qwen-vl-max",
70 | help="Qwen model name",
71 | )
72 | args = parser.parse_args()
73 | return args
74 |
75 |
76 | if __name__ == "__main__":
77 | args = arg_parser()
78 |
79 | # prepare the model
80 | if args.dashscope_api_key:
81 | DASHSCOPE_API_KEY = args.dashscope_api_key
82 | else:
83 | DASHSCOPE_API_KEY = os.getenv('DASHSCOPE_API_KEY')
84 |
85 | if DASHSCOPE_API_KEY is None:
86 | raise ValueError("Please set the DASHSCOPE_API_KEY environment variable or pass it as an argument")
87 |
88 | dashscope.api_key = DASHSCOPE_API_KEY
89 | model = Qwen(model=args.model_name)
90 |
91 | # evaluate on mm-vet
92 | evaluate_on_mmvet(args, model)
93 |
--------------------------------------------------------------------------------
/inference/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import base64
4 |
5 |
6 | # Function to encode the image
7 | def encode_image(image_path):
8 | with open(image_path, "rb") as image_file:
9 | return base64.b64encode(image_file.read()).decode('utf-8')
10 |
11 |
12 | def evaluate_on_mmvet(args, model):
13 | if os.path.exists(args.result_path) is False:
14 | os.makedirs(args.result_path)
15 | results_path = os.path.join(args.result_path, f"{args.model_name}.json")
16 | image_folder = os.path.join(args.mmvet_path, "images")
17 | meta_data = os.path.join(args.mmvet_path, "mm-vet.json")
18 |
19 | with open(meta_data, 'r') as f:
20 | data = json.load(f)
21 |
22 | if os.path.exists(results_path):
23 | with open(results_path, 'r') as f:
24 | results = json.load(f)
25 | else:
26 | results = {}
27 |
28 | for i in range(len(data)):
29 | id = f"v1_{i}"
30 | if id in results:
31 | continue
32 | imagename = data[id]['imagename']
33 | img_path = os.path.join(image_folder, imagename)
34 | prompt = data[id]['question']
35 | print(f"\n{id}")
36 | print(f"Image: {imagename}")
37 | print(f"Prompt: {prompt}")
38 | response = model.get_response(img_path, prompt)
39 | print(f"Response: {response}")
40 | results[id] = response
41 | with open(results_path, 'w') as f:
42 | json.dump(results, f, indent=4)
--------------------------------------------------------------------------------
/mm-vet_evaluator.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from openai import OpenAI\n",
10 | "import json\n",
11 | "import os\n",
12 | "from tqdm import tqdm\n",
13 | "import pandas as pd\n",
14 | "import numpy as np\n",
15 | "from collections import Counter\n",
16 | "import time\n",
17 | "import pathlib\n",
18 | "client = OpenAI(\n",
19 | " # This is the default and can be omitted\n",
20 | " api_key=os.environ.get(\"OPENAI_API_KEY\"),\n",
21 | ")"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "gpt_model = \"gpt-4-0613\"\n",
31 | "\n",
32 | "\n",
33 | "prompt = \"\"\"Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. in the ground truth means it is totally right only when all elements in the ground truth are present in the prediction, and means it is totally right when any one element in the ground truth is present in the prediction. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Just complete the last space of the correctness score.\n",
34 | "\n",
35 | "Question | Ground truth | Prediction | Correctness\n",
36 | "--- | --- | --- | ---\n",
37 | "What is x in the equation? | -1 -5 | x = 3 | 0.0\n",
38 | "What is x in the equation? | -1 -5 | x = -1 | 0.5\n",
39 | "What is x in the equation? | -1 -5 | x = -5 | 0.5\n",
40 | "What is x in the equation? | -1 -5 | x = -5 or 5 | 0.5\n",
41 | "What is x in the equation? | -1 -5 | x = -1 or x = -5 | 1.0\n",
42 | "Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme talks about Iceland and Greenland. It's pointing out that despite their names, Iceland is not very icy and Greenland isn't very green. | 0.4\n",
43 | "Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme is using humor to point out the misleading nature of Iceland's and Greenland's names. Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow. The text 'This is why I have trust issues' is a playful way to suggest that these contradictions can lead to distrust or confusion. The humor in this meme is derived from the unexpected contrast between the names of the countries and their actual physical characteristics. | 1.0\n",
44 | "\"\"\""
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# load metadata\n",
54 | "# Download mm-vet.zip and `unzip mm-vet.zip` and change the path below\n",
55 | "mmvet_path = \"/path/to/mm-vet\"\n",
56 | "use_sub_set = False\n",
57 | "decimal_places = 1 # number of decimal places to round to\n",
58 | "\n",
59 | "\n",
60 | "if use_sub_set:\n",
61 | " bard_set_file = os.path.join(mmvet_path, \"bard_set.json\")\n",
62 | " with open(bard_set_file, 'r') as f:\n",
63 | " sub_set = json.load(f)\n",
64 | " sub_set_name = 'bardset'\n",
65 | " sub_set_name = sub_set_name + '_'\n",
66 | "else:\n",
67 | " sub_set = None\n",
68 | " sub_set_name = ''\n",
69 | "\n",
70 | "mmvet_metadata = os.path.join(mmvet_path, \"mm-vet.json\")\n",
71 | "with open(mmvet_metadata, 'r') as f:\n",
72 | " data = json.load(f)\n",
73 | "\n",
74 | "\n",
75 | "counter = Counter()\n",
76 | "cap_set_list = []\n",
77 | "cap_set_counter = []\n",
78 | "len_data = 0\n",
79 | "for id, value in data.items():\n",
80 | " if sub_set is not None and id not in sub_set:\n",
81 | " continue\n",
82 | " question = value[\"question\"]\n",
83 | " answer = value[\"answer\"]\n",
84 | " cap = value[\"capability\"]\n",
85 | " cap = set(cap)\n",
86 | " counter.update(cap)\n",
87 | " if cap not in cap_set_list:\n",
88 | " cap_set_list.append(cap)\n",
89 | " cap_set_counter.append(1)\n",
90 | " else:\n",
91 | " cap_set_counter[cap_set_list.index(cap)] += 1\n",
92 | " \n",
93 | " len_data += 1\n",
94 | "\n",
95 | "sorted_list = counter.most_common()\n",
96 | "columns = [k for k, v in sorted_list]\n",
97 | "columns.append(\"total\")\n",
98 | "columns.append(\"std\")\n",
99 | "columns.append('runs')\n",
100 | "df = pd.DataFrame(columns=columns)\n",
101 | "\n",
102 | "\n",
103 | "cap_set_sorted_indices = np.argsort(-np.array(cap_set_counter))\n",
104 | "new_cap_set_list = []\n",
105 | "new_cap_set_counter = []\n",
106 | "for index in cap_set_sorted_indices:\n",
107 | " new_cap_set_list.append(cap_set_list[index])\n",
108 | " new_cap_set_counter.append(cap_set_counter[index])\n",
109 | "\n",
110 | "cap_set_list = new_cap_set_list\n",
111 | "cap_set_counter = new_cap_set_counter\n",
112 | "cap_set_names = [\"_\".join(list(cap_set)) for cap_set in cap_set_list]\n",
113 | "\n",
114 | "columns2 = cap_set_names\n",
115 | "columns2.append(\"total\")\n",
116 | "columns2.append(\"std\")\n",
117 | "columns2.append('runs')\n",
118 | "df2 = pd.DataFrame(columns=columns2)"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 4,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "result_file = \"results/llava_llama2_13b_chat.json\" # change your model result_file\n",
128 | "result_path = \"results\" # path to save grading results\n",
129 | "num_run = 1 # we set it as 5 in the paper\n",
130 | "\n",
131 | "if os.path.exists(result_file) is False:\n",
132 | " raise ValueError(\"Result file does not exist\")\n",
133 | "if not result_file.endswith(('.json', '.JSON')):\n",
134 | " raise ValueError(\"Result file should be a json file\")\n",
135 | "model = pathlib.Path(result_file).stem\n",
136 | "# grade results for each sample to svae\n",
137 | "grade_file = f'{model}_{gpt_model}-grade-{num_run}runs.json'\n",
138 | "grade_file = os.path.join(result_path, grade_file)\n",
139 | "\n",
140 | "# score results regarding capabilities/capability integration to save\n",
141 | "cap_score_file = f'{model}_{sub_set_name}{gpt_model}-cap-score-{num_run}runs.csv'\n",
142 | "cap_score_file = os.path.join(result_path, cap_score_file)\n",
143 | "cap_int_score_file = f'{model}_{sub_set_name}{gpt_model}-cap-int-score-{num_run}runs.csv'\n",
144 | "cap_int_score_file = os.path.join(result_path, cap_int_score_file)"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 5,
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "name": "stdout",
154 | "output_type": "stream",
155 | "text": [
156 | "eval run 0\n"
157 | ]
158 | },
159 | {
160 | "name": "stderr",
161 | "output_type": "stream",
162 | "text": [
163 | " 0%| | 0/218 [00:00, ?it/s]"
164 | ]
165 | },
166 | {
167 | "name": "stderr",
168 | "output_type": "stream",
169 | "text": [
170 | "100%|██████████| 218/218 [03:53<00:00, 1.07s/it]\n"
171 | ]
172 | }
173 | ],
174 | "source": [
175 | "with open(result_file) as f:\n",
176 | " results = json.load(f)\n",
177 | "if os.path.exists(grade_file):\n",
178 | " with open(grade_file, 'r') as f:\n",
179 | " grade_results = json.load(f)\n",
180 | "else:\n",
181 | " grade_results = {}\n",
182 | "\n",
183 | "\n",
184 | "def need_more_runs():\n",
185 | " need_more_runs = False\n",
186 | " if len(grade_results) > 0:\n",
187 | " for k, v in grade_results.items():\n",
188 | " if len(v['score']) < num_run:\n",
189 | " need_more_runs = True\n",
190 | " break\n",
191 | " return need_more_runs or len(grade_results) < len_data\n",
192 | "\n",
193 | "\n",
194 | "while need_more_runs():\n",
195 | " for j in range(num_run):\n",
196 | " print(f'eval run {j}')\n",
197 | " for id, line in tqdm(data.items()):\n",
198 | " if sub_set is not None and id not in sub_set:\n",
199 | " continue\n",
200 | " if id in grade_results and len(grade_results[id]['score']) >= (j + 1):\n",
201 | " continue\n",
202 | "\n",
203 | " model_pred = results[id]\n",
204 | " \n",
205 | " question = prompt + '\\n' + ' | '.join([line['question'], line['answer'].replace(\"\", \" \").replace(\"\", \" \"), model_pred, \"\"])\n",
206 | " messages = [\n",
207 | " {\"role\": \"user\", \"content\": question},\n",
208 | " ]\n",
209 | "\n",
210 | " if id not in grade_results:\n",
211 | " sample_grade = {'model': [], 'content': [], 'score': []}\n",
212 | " else:\n",
213 | " sample_grade = grade_results[id]\n",
214 | "\n",
215 | " \n",
216 | " grade_sample_run_complete = False\n",
217 | " temperature = 0.0\n",
218 | "\n",
219 | " while not grade_sample_run_complete:\n",
220 | " try:\n",
221 | " response = client.chat.completions.create(\n",
222 | " model=gpt_model,\n",
223 | " max_tokens=3,\n",
224 | " temperature=temperature,\n",
225 | " messages=messages)\n",
226 | " content = response.choices[0].message.content\n",
227 | " flag = True\n",
228 | " try_time = 1\n",
229 | " while flag:\n",
230 | " try:\n",
231 | " content = content.split(' ')[0].strip()\n",
232 | " score = float(content)\n",
233 | " if score > 1.0 or score < 0.0:\n",
234 | " assert False\n",
235 | " flag = False\n",
236 | " except:\n",
237 | " question = prompt + '\\n' + ' | '.join([line['question'], line['answer'].replace(\"\", \" \").replace(\"\", \" \"), model_pred, \"\"]) + \"\\nPredict the correctness of the answer (digit): \"\n",
238 | " messages = [\n",
239 | " {\"role\": \"user\", \"content\": question},\n",
240 | " ]\n",
241 | " response = client.chat.completions.create(\n",
242 | " model=gpt_model,\n",
243 | " max_tokens=3,\n",
244 | " temperature=temperature,\n",
245 | " messages=messages)\n",
246 | " content = response.choices[0].message.content\n",
247 | " try_time += 1\n",
248 | " temperature += 0.5\n",
249 | " print(f\"{id} try {try_time} times\")\n",
250 | " print(content)\n",
251 | " if try_time > 5:\n",
252 | " score = 0.0\n",
253 | " flag = False\n",
254 | " grade_sample_run_complete = True\n",
255 | " except:\n",
256 | " # gpt4 may have token rate limit\n",
257 | " print(\"sleep 30s\")\n",
258 | " time.sleep(30)\n",
259 | "\n",
260 | " if len(sample_grade['model']) >= j + 1:\n",
261 | " sample_grade['model'][j] = response.model\n",
262 | " sample_grade['content'][j] = content\n",
263 | " sample_grade['score'][j] = score\n",
264 | " else:\n",
265 | " sample_grade['model'].append(response.model)\n",
266 | " sample_grade['content'].append(content)\n",
267 | " sample_grade['score'].append(score)\n",
268 | " grade_results[id] = sample_grade\n",
269 | "\n",
270 | " with open(grade_file, 'w') as f:\n",
271 | " json.dump(grade_results, f, indent=4)"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 6,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": [
280 | "assert not need_more_runs()\n",
281 | "cap_socres = {k: [0.0]*num_run for k in columns[:-2]}\n",
282 | "counter['total'] = len_data\n",
283 | "\n",
284 | "cap_socres2 = {k: [0.0]*num_run for k in columns2[:-2]}\n",
285 | "counter2 = {columns2[i]:cap_set_counter[i] for i in range(len(cap_set_counter))}\n",
286 | "counter2['total'] = len_data\n",
287 | "\n",
288 | "for k, v in grade_results.items():\n",
289 | " if sub_set is not None and k not in sub_set:\n",
290 | " continue\n",
291 | " for i in range(num_run):\n",
292 | " score = v['score'][i]\n",
293 | " caps = set(data[k]['capability'])\n",
294 | " for c in caps:\n",
295 | " cap_socres[c][i] += score\n",
296 | " \n",
297 | " cap_socres['total'][i] += score\n",
298 | "\n",
299 | " index = cap_set_list.index(caps)\n",
300 | " cap_socres2[cap_set_names[index]][i] += score\n",
301 | " cap_socres2['total'][i] += score\n",
302 | "\n",
303 | "for k, v in cap_socres.items():\n",
304 | " cap_socres[k] = np.array(v) / counter[k] *100\n",
305 | "\n",
306 | "\n",
307 | "std = round(cap_socres['total'].std(), decimal_places)\n",
308 | "total_copy = cap_socres['total'].copy()\n",
309 | "runs = str(list(np.round(total_copy, decimal_places)))\n",
310 | "\n",
311 | "for k, v in cap_socres.items():\n",
312 | " cap_socres[k] = round(v.mean(), decimal_places)\n",
313 | "\n",
314 | "cap_socres['std'] = std\n",
315 | "cap_socres['runs'] = runs\n",
316 | "df.loc[model] = cap_socres\n",
317 | "\n",
318 | "\n",
319 | "for k, v in cap_socres2.items():\n",
320 | " cap_socres2[k] = round(np.mean(np.array(v) / counter2[k] *100), decimal_places)\n",
321 | "cap_socres2['std'] = std\n",
322 | "cap_socres2['runs'] = runs\n",
323 | "df2.loc[model] = cap_socres2\n",
324 | "\n",
325 | "df.to_csv(cap_score_file)\n",
326 | "df2.to_csv(cap_int_score_file)"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 7,
332 | "metadata": {},
333 | "outputs": [
334 | {
335 | "data": {
336 | "text/html": [
337 | "\n",
338 | "\n",
351 | "
\n",
352 | " \n",
353 | " \n",
354 | " | \n",
355 | " rec | \n",
356 | " ocr | \n",
357 | " know | \n",
358 | " gen | \n",
359 | " spat | \n",
360 | " math | \n",
361 | " total | \n",
362 | " std | \n",
363 | " runs | \n",
364 | "
\n",
365 | " \n",
366 | " \n",
367 | " \n",
368 | " llava_llama2_13b_chat | \n",
369 | " 39.7 | \n",
370 | " 23.2 | \n",
371 | " 27.1 | \n",
372 | " 30.4 | \n",
373 | " 30.8 | \n",
374 | " 7.7 | \n",
375 | " 33.3 | \n",
376 | " 0.0 | \n",
377 | " [33.3] | \n",
378 | "
\n",
379 | " \n",
380 | "
\n",
381 | "
"
382 | ],
383 | "text/plain": [
384 | " rec ocr know gen spat math total std runs\n",
385 | "llava_llama2_13b_chat 39.7 23.2 27.1 30.4 30.8 7.7 33.3 0.0 [33.3]"
386 | ]
387 | },
388 | "execution_count": 7,
389 | "metadata": {},
390 | "output_type": "execute_result"
391 | }
392 | ],
393 | "source": [
394 | "# when use subset, please note the column order is different from the full set\n",
395 | "# because it ranks by numbers of capabilties/capability integrations\n",
396 | "df"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": 8,
402 | "metadata": {},
403 | "outputs": [
404 | {
405 | "data": {
406 | "text/html": [
407 | "\n",
408 | "\n",
421 | "
\n",
422 | " \n",
423 | " \n",
424 | " | \n",
425 | " rec_know_gen | \n",
426 | " rec | \n",
427 | " ocr_spat | \n",
428 | " ocr_spat_math | \n",
429 | " rec_spat | \n",
430 | " ocr | \n",
431 | " ocr_math | \n",
432 | " rec_know | \n",
433 | " ocr_rec_know_gen | \n",
434 | " ocr_rec_spat_gen | \n",
435 | " ocr_rec_spat | \n",
436 | " ocr_rec | \n",
437 | " ocr_spat_know | \n",
438 | " rec_spat_know | \n",
439 | " ocr_spat_gen | \n",
440 | " ocr_rec_spat_math | \n",
441 | " total | \n",
442 | " std | \n",
443 | " runs | \n",
444 | "
\n",
445 | " \n",
446 | " \n",
447 | " \n",
448 | " llava_llama2_13b_chat | \n",
449 | " 30.5 | \n",
450 | " 59.5 | \n",
451 | " 23.5 | \n",
452 | " 14.3 | \n",
453 | " 58.3 | \n",
454 | " 31.7 | \n",
455 | " 0.0 | \n",
456 | " 27.8 | \n",
457 | " 5.0 | \n",
458 | " 60.0 | \n",
459 | " 28.6 | \n",
460 | " 50.0 | \n",
461 | " 33.3 | \n",
462 | " 0.0 | \n",
463 | " 10.0 | \n",
464 | " 0.0 | \n",
465 | " 33.3 | \n",
466 | " 0.0 | \n",
467 | " [33.3] | \n",
468 | "
\n",
469 | " \n",
470 | "
\n",
471 | "
"
472 | ],
473 | "text/plain": [
474 | " rec_know_gen rec ocr_spat ocr_spat_math rec_spat \\\n",
475 | "llava_llama2_13b_chat 30.5 59.5 23.5 14.3 58.3 \n",
476 | "\n",
477 | " ocr ocr_math rec_know ocr_rec_know_gen \\\n",
478 | "llava_llama2_13b_chat 31.7 0.0 27.8 5.0 \n",
479 | "\n",
480 | " ocr_rec_spat_gen ocr_rec_spat ocr_rec ocr_spat_know \\\n",
481 | "llava_llama2_13b_chat 60.0 28.6 50.0 33.3 \n",
482 | "\n",
483 | " rec_spat_know ocr_spat_gen ocr_rec_spat_math total \\\n",
484 | "llava_llama2_13b_chat 0.0 10.0 0.0 33.3 \n",
485 | "\n",
486 | " std runs \n",
487 | "llava_llama2_13b_chat 0.0 [33.3] "
488 | ]
489 | },
490 | "execution_count": 8,
491 | "metadata": {},
492 | "output_type": "execute_result"
493 | }
494 | ],
495 | "source": [
496 | "df2"
497 | ]
498 | }
499 | ],
500 | "metadata": {
501 | "kernelspec": {
502 | "display_name": "base",
503 | "language": "python",
504 | "name": "python3"
505 | },
506 | "language_info": {
507 | "codemirror_mode": {
508 | "name": "ipython",
509 | "version": 3
510 | },
511 | "file_extension": ".py",
512 | "mimetype": "text/x-python",
513 | "name": "python",
514 | "nbconvert_exporter": "python",
515 | "pygments_lexer": "ipython3",
516 | "version": "3.10.9"
517 | },
518 | "orig_nbformat": 4
519 | },
520 | "nbformat": 4,
521 | "nbformat_minor": 2
522 | }
523 |
--------------------------------------------------------------------------------
/mm-vet_evaluator.py:
--------------------------------------------------------------------------------
1 | # in case you want to run this script independently
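2 | # Example invocation (illustrative paths; all flags are defined in arg_parser below):
3 | #   python mm-vet_evaluator.py --mmvet_path /path/to/mm-vet \
4 | #       --result_file results/llava_llama2_13b_chat.json --gpt_model gpt-4-0613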
2 |
3 | import argparse
4 | from openai import OpenAI
5 | from openai._exceptions import RateLimitError
6 | import json
7 | import os
8 | from tqdm import tqdm
9 | import pandas as pd
10 | import numpy as np
11 | from collections import Counter
12 | import time
13 | import pathlib
14 |
15 | prompt = """Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. in the ground truth means it is totally right only when all elements in the ground truth are present in the prediction, and means it is totally right when any one element in the ground truth is present in the prediction. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Just complete the last space of the correctness score.
16 |
17 | Question | Ground truth | Prediction | Correctness
18 | --- | --- | --- | ---
19 | What is x in the equation? | -1 <AND> -5 | x = 3 | 0.0
20 | What is x in the equation? | -1 <AND> -5 | x = -1 | 0.5
21 | What is x in the equation? | -1 <AND> -5 | x = -5 | 0.5
22 | What is x in the equation? | -1 <AND> -5 | x = -5 or 5 | 0.5
23 | What is x in the equation? | -1 <AND> -5 | x = -1 or x = -5 | 1.0
24 | Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme talks about Iceland and Greenland. It's pointing out that despite their names, Iceland is not very icy and Greenland isn't very green. | 0.4
25 | Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme is using humor to point out the misleading nature of Iceland's and Greenland's names. Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow. The text 'This is why I have trust issues' is a playful way to suggest that these contradictions can lead to distrust or confusion. The humor in this meme is derived from the unexpected contrast between the names of the countries and their actual physical characteristics. | 1.0
26 | """
27 |
28 |
29 | def arg_parser(prompt=prompt):
30 | parser = argparse.ArgumentParser()
31 | parser.add_argument(
32 | "--mmvet_path",
33 | type=str,
34 | default="/path/to/mm-vet",
35 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here",
36 | )
37 | parser.add_argument(
38 | "--result_file",
39 | type=str,
40 | default="results/llava_llama2_13b_chat.json",
41 | help="path to the model result file, must end with .json",
42 | )
43 | parser.add_argument(
44 | "--result_path",
45 | type=str,
46 | default="results",
47 | help="path to save the grading results",
48 | )
49 | parser.add_argument(
50 | "--openai_api_key", type=str, default=None,
51 | help="If not specified, use OPENAI_API_KEY environment variable."
52 | )
53 | parser.add_argument(
54 | "--gpt_model", type=str, default="gpt-4-0613", help="gpt model name"
55 | )
56 | parser.add_argument(
57 | "--prompt", type=str, default=prompt, help="prompt for the model"
58 | )
59 | parser.add_argument(
60 | "--use_sub_set",
61 | action="store_true",
62 | help="use a subset of the data for debugging",
63 | )
64 | parser.add_argument(
65 | "--decimal_places",
66 | type=int,
67 | default=1,
68 | help="number of decimal places to round to",
69 | )
70 | parser.add_argument(
71 | "--num_run",
72 | type=int,
73 | default=1,
74 | help="we set it as 5 in the paper",
75 | )
76 | args = parser.parse_args()
77 | return args
78 |
79 | def get_file_names(args, model, sub_set_name):
80 |     # grade results for each sample to save
81 | grade_file = f"{model}_{args.gpt_model}-grade-{args.num_run}runs.json"
82 | grade_file = os.path.join(args.result_path, grade_file)
83 |
84 | # score results regarding capabilities/capability integration to save
85 | cap_score_file = (
86 | f"{model}_{sub_set_name}{args.gpt_model}-cap-score-{args.num_run}runs.csv"
87 | )
88 | cap_score_file = os.path.join(args.result_path, cap_score_file)
89 | cap_int_score_file = f"{model}_{sub_set_name}{args.gpt_model}-cap-int-score-{args.num_run}runs.csv"
90 | cap_int_score_file = os.path.join(args.result_path, cap_int_score_file)
91 | return grade_file, cap_score_file, cap_int_score_file
92 |
93 |
94 | def load_metadata(args):
95 | if args.use_sub_set:
96 | bard_set_file = os.path.join(args.mmvet_path, "bard_set.json")
97 | with open(bard_set_file, "r") as f:
98 | sub_set = json.load(f)
99 | sub_set_name = "bardset"
100 | sub_set_name = sub_set_name + "_"
101 | else:
102 | sub_set = None
103 | sub_set_name = ""
104 |
105 | mmvet_metadata = os.path.join(args.mmvet_path, "mm-vet.json")
106 | with open(mmvet_metadata, "r") as f:
107 | data = json.load(f)
108 |
109 | counter = Counter()
110 | cap_set_list = []
111 | cap_set_counter = []
112 | len_data = 0
113 | for id, value in data.items():
114 | if sub_set is not None and id not in sub_set:
115 | continue
116 | cap = value["capability"]
117 | cap = set(cap)
118 | counter.update(cap)
119 | if cap not in cap_set_list:
120 | cap_set_list.append(cap)
121 | cap_set_counter.append(1)
122 | else:
123 | cap_set_counter[cap_set_list.index(cap)] += 1
124 |
125 | len_data += 1
126 |
127 | sorted_list = counter.most_common()
128 | columns = [k for k, v in sorted_list]
129 | columns.append("total")
130 | columns.append("std")
131 | columns.append("runs")
132 | df = pd.DataFrame(columns=columns)
133 |
134 | cap_set_sorted_indices = np.argsort(-np.array(cap_set_counter))
135 | new_cap_set_list = []
136 | new_cap_set_counter = []
137 | for index in cap_set_sorted_indices:
138 | new_cap_set_list.append(cap_set_list[index])
139 | new_cap_set_counter.append(cap_set_counter[index])
140 |
141 | cap_set_list = new_cap_set_list
142 | cap_set_counter = new_cap_set_counter
143 | cap_set_names = ["_".join(list(cap_set)) for cap_set in cap_set_list]
144 |
145 | columns2 = cap_set_names
146 | columns2.append("total")
147 | columns2.append("std")
148 | columns2.append("runs")
149 | df2 = pd.DataFrame(columns=columns2)
150 | return (
151 | sub_set,
152 | sub_set_name,
153 | data,
154 | counter,
155 | cap_set_list,
156 | cap_set_counter,
157 | len_data,
158 | df,
159 | df2,
160 | cap_set_names,
161 | )
162 |
163 |
164 | def runs(
165 | args,
166 | grade_file,
167 | data,
168 | len_data,
169 | sub_set=None,
170 | ):
171 | with open(args.result_file) as f:
172 | results = json.load(f)
173 | if os.path.exists(grade_file):
174 | with open(grade_file, "r") as f:
175 | grade_results = json.load(f)
176 | else:
177 | grade_results = {}
178 |
179 | def need_more_runs(args, grade_results, len_data):
180 | need_more_runs = False
181 | if len(grade_results) > 0:
182 | for k, v in grade_results.items():
183 | if len(v["score"]) < args.num_run:
184 | need_more_runs = True
185 | break
186 | return need_more_runs or len(grade_results) < len_data
187 |
188 | while need_more_runs(args, grade_results, len_data):
189 | for j in range(args.num_run):
190 | print(f"eval run {j}")
191 | for id, line in tqdm(data.items()):
192 | if sub_set is not None and id not in sub_set:
193 | continue
194 | if id in grade_results and len(grade_results[id]["score"]) >= (j + 1):
195 | continue
196 |
197 | model_pred = results[id]
198 |
199 | question = (
200 | args.prompt
201 | + "\n"
202 | + " | ".join(
203 | [
204 | line["question"],
205 | line["answer"]
206 | .replace("", " ")
207 | .replace("", " "),
208 | model_pred,
209 | "",
210 | ]
211 | )
212 | )
213 | messages = [
214 | {"role": "user", "content": question},
215 | ]
216 |
217 | if id not in grade_results:
218 | sample_grade = {"model": [], "content": [], "score": []}
219 | else:
220 | sample_grade = grade_results[id]
221 |
222 | grade_sample_run_complete = False
223 | temperature = 0.0
224 |
225 | while not grade_sample_run_complete:
226 | try:
227 | response = client.chat.completions.create(
228 | model=args.gpt_model,
229 | max_tokens=3,
230 | temperature=temperature,
231 | messages=messages,
232 | )
233 | content = response.choices[0].message.content
234 | flag = True
235 | try_time = 1
236 | while flag:
237 | try:
238 | content = content.split(" ")[0].strip()
239 | score = float(content)
240 | if score > 1.0 or score < 0.0:
241 | assert False
242 | flag = False
243 | except:
244 | question = (
245 | args.prompt
246 | + "\n"
247 | + " | ".join(
248 | [
249 | line["question"],
250 | line["answer"]
251 | .replace("", " ")
252 | .replace("", " "),
253 | model_pred,
254 | "",
255 | ]
256 | )
257 | + "\nPredict the correctness of the answer (digit): "
258 | )
259 | messages = [
260 | {"role": "user", "content": question},
261 | ]
262 | response = client.chat.completions.create(
263 | model=args.gpt_model,
264 | max_tokens=3,
265 | temperature=temperature,
266 | messages=messages,
267 | )
268 | content = response.choices[0].message.content
269 | try_time += 1
270 | temperature += 0.5
271 | print(f"{id} try {try_time} times")
272 | print(content)
273 | if try_time > 5:
274 | score = 0.0
275 | flag = False
276 | grade_sample_run_complete = True
277 | except RateLimitError as e:
278 | # gpt4 may have token rate limit
279 | print("sleep 30s")
280 | time.sleep(30)
281 |
282 | if len(sample_grade["model"]) >= j + 1:
283 | sample_grade["model"][j] = response.model
284 | sample_grade["content"][j] = content
285 | sample_grade["score"][j] = score
286 | else:
287 | sample_grade["model"].append(response.model)
288 | sample_grade["content"].append(content)
289 | sample_grade["score"].append(score)
290 | grade_results[id] = sample_grade
291 |
292 | with open(grade_file, "w") as f:
293 | json.dump(grade_results, f, indent=4)
294 |
295 | return grade_results
296 |
297 |
298 | def export_result(args, model, df, df2, grade_results, data, cap_set_counter, cap_set_names):
299 | columns = df.columns
300 | columns2 = df2.columns
301 |
302 | cap_socres = {k: [0.0] * args.num_run for k in columns[:-2]}
303 | counter["total"] = len_data
304 |
305 | cap_socres2 = {k: [0.0] * args.num_run for k in columns2[:-2]}
306 | counter2 = {columns2[i]: cap_set_counter[i] for i in range(len(cap_set_counter))}
307 | counter2["total"] = len_data
308 |
309 | for k, v in grade_results.items():
310 | if sub_set is not None and k not in sub_set:
311 | continue
312 | for i in range(args.num_run):
313 | score = v["score"][i]
314 | caps = set(data[k]["capability"])
315 | for c in caps:
316 | cap_socres[c][i] += score
317 |
318 | cap_socres["total"][i] += score
319 |
320 | index = cap_set_list.index(caps)
321 | cap_socres2[cap_set_names[index]][i] += score
322 | cap_socres2["total"][i] += score
323 |
324 | for k, v in cap_socres.items():
325 | cap_socres[k] = np.array(v) / counter[k] * 100
326 |
327 | std = round(cap_socres["total"].std(), args.decimal_places)
328 | total_copy = cap_socres["total"].copy()
329 | runs = str(list(np.round(total_copy, args.decimal_places)))
330 |
331 | for k, v in cap_socres.items():
332 | cap_socres[k] = round(v.mean(), args.decimal_places)
333 |
334 | cap_socres["std"] = std
335 | cap_socres["runs"] = runs
336 | df.loc[model] = cap_socres
337 |
338 | for k, v in cap_socres2.items():
339 | cap_socres2[k] = round(
340 | np.mean(np.array(v) / counter2[k] * 100), args.decimal_places
341 | )
342 | cap_socres2["std"] = std
343 | cap_socres2["runs"] = runs
344 | df2.loc[model] = cap_socres2
345 |
346 | df.to_csv(cap_score_file)
347 | df2.to_csv(cap_int_score_file)
348 |
349 | return df, df2
350 |
351 |
352 | if __name__ == "__main__":
353 | args = arg_parser()
354 | if args.openai_api_key:
355 | OPENAI_API_KEY = args.openai_api_key
356 | else:
357 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
358 | client = OpenAI(
359 | api_key=OPENAI_API_KEY
360 | )
361 |
362 | if os.path.exists(args.result_file) is False:
363 | raise ValueError("Result file does not exist")
364 | if not args.result_file.endswith(('.json', '.JSON')):
365 | raise ValueError("Result file should be a json file")
366 | model = pathlib.Path(args.result_file).stem
367 |
368 | metadata = load_metadata(args)
369 | (
370 | sub_set,
371 | sub_set_name,
372 | data,
373 | counter,
374 | cap_set_list,
375 | cap_set_counter,
376 | len_data,
377 | df,
378 | df2,
379 | cap_set_names,
380 | ) = metadata
381 | file_names = get_file_names(args, model, sub_set_name)
382 | (
383 | grade_file,
384 | cap_score_file,
385 | cap_int_score_file,
386 | ) = file_names
387 | grade_results = runs(
388 | args,
389 | grade_file,
390 | data,
391 | len_data,
392 | sub_set,
393 | )
394 | df, df2 = export_result(
395 | args,
396 | model,
397 | df,
398 | df2,
399 | grade_results,
400 | data,
401 | cap_set_counter,
402 | cap_set_names,
403 | )
404 | print(df)
405 | print("\n")
406 | print(df2)
407 | print("\n")
408 | print(f"Grading results are saved in:\n{grade_file}\n{cap_score_file}\n{cap_int_score_file}")
--------------------------------------------------------------------------------
/v2/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | # [MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models for Integrated Capabilities](https://arxiv.org/abs/2408.00765)
7 |
8 |
9 | [Paper](https://arxiv.org/abs/2408.00765)
10 | [Download Dataset](https://github.com/yuweihao/MM-Vet/releases/download/v2/mm-vet-v2.zip)
11 | [Dataset on Hugging Face]
12 | [Leaderboard]
13 | [Online Evaluator](https://huggingface.co/spaces/whyu/MM-Vet-v2_Evaluator)
14 |
15 |
16 |
17 |
18 | 
19 | Figure 1: Four examples from MM-Vet v2. Compared with MM-Vet, MM-Vet v2 introduces more high-quality evaluation samples (e.g., (a) and (b)), and the ones with the new capability of image-text sequence understanding (e.g., (c) and (d)).
20 |
21 | The code is under the Apache 2.0 license, and the dataset is under the CC BY-NC 4.0 license.
22 |
23 | ## Evaluate your model on MM-Vet v2
24 | **Step 0**: Install the openai package with `pip install openai>=1` and get access to the GPT-4 API. If you do not have access, you can try the MM-Vet v2 online evaluator on [Hugging Face Space](https://huggingface.co/spaces/whyu/MM-Vet-v2_Evaluator) (it may take a long time, depending on the number of users).
25 |
26 | **Step 1**: Download the MM-Vet v2 data [here](https://github.com/yuweihao/MM-Vet/releases/download/v2/mm-vet-v2.zip) and unzip it with `unzip mm-vet-v2.zip`.
27 |
28 | **Step 2**: Run your model on MM-Vet v2 and save its outputs in a json file like [gpt-4o-2024-05-13_detail-high.json](results/gpt-4o-2024-05-13_detail-high.json), or just use [gpt-4o-2024-05-13_detail-high.json](results/gpt-4o-2024-05-13_detail-high.json) as an example to evaluate. We also release inference scripts for GPT-4, Claude, and Gemini.
29 |
30 | ```bash
31 | image_detail=high # or auto / low; refer to https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding
32 |
33 | python inference/gpt4.py --mmvetv2_path /path/to/mm-vet-v2 --model_name gpt-4o-2024-05-13 --image_detail ${image_detail}
34 | ```
35 |
36 | ```bash
37 | python inference/claude.py --mmvetv2_path /path/to/mm-vet-v2 --model_name claude-3-5-sonnet-20240620
38 | ```
39 |
40 | ```bash
41 | python inference/gemini.py --mmvetv2_path /path/to/mm-vet-v2 --model_name gemini-1.5-pro
42 | ```
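43 |
44 | The result file is a json dict mapping each sample id to the model's answer string (the ids are the keys of `mm-vet-v2.json`; `v2_0` below is only an assumed illustration of that convention). A minimal sketch of writing such a file:
45 |
46 | ```python
47 | import json
48 |
49 | # collected from your own inference loop; keys must match those in mm-vet-v2.json
50 | results = {"v2_0": "the model's answer to the first sample"}
51 |
52 | with open("results/my_model.json", "w") as f:
53 |     json.dump(results, f, indent=4)
54 | ```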
43 |
44 | **Step 3**: `git clone https://github.com/yuweihao/MM-Vet.git && cd MM-Vet/v2`, then run the LLM-based evaluator
45 | ```bash
46 | python mm-vet-v2_evaluator.py --mmvetv2_path /path/to/mm-vet-v2 --result_file results/gpt-4o-2024-05-13_detail-high.json
47 | ```
48 | If you cannot access GPT-4 (gpt-4-0613), you can upload your model output results (json file) to the MM-Vet v2 online evaluator on [Hugging Face Space](https://huggingface.co/spaces/whyu/MM-Vet-v2_Evaluator) to get the grading results.
49 |
50 | ## Some results
51 | 
52 |
53 |
54 | ## Some interesting samples
55 |
56 | 
57 |
58 | **Q**: As shown in the image, two iron balls are hanging on the Leaning Tower of Pisa, ball A weighs 20kg, and ball B weighs 5kg. If the ropes hanging them are cut at the same time and air resistance is ignored, which iron ball will land first?
59 |
60 | **GT**: A
61 |
62 | **Required capabilities**: Recognition, OCR, spatial awareness, knowledge
63 |
64 | ---
65 |
66 | 
67 |
68 | **Q**: How many feet do these animals have in total?
69 |
70 | **GT**: 10
71 |
72 | **Required capabilities**: Recognition, knowledge, math
73 |
74 | ---
75 |
76 | 
77 |
78 | **Q**: How many feet do these animals have in total?
79 |
80 | **GT**: 16
81 |
82 | **Required capabilities**: Recognition, knowledge, math
83 |
84 | ---
85 |
86 | 
87 |
88 | **Q**: Is it possible for the car to move with magnetic force according to the Physical laws?
89 |
90 | **GT**: yes
91 |
92 | **Required capabilities**: Recognition, OCR, spatial awareness, knowledge
93 |
94 | ---
95 |
96 | 
97 |
98 | **Q**: Which track should the trolley go on, A or B?
99 |
100 | **GT**: A
101 |
102 | **Required capabilities**: Recognition, spatial awareness
103 |
104 | ---
105 |
106 | 
107 |
108 | **Q**: Can we make sure the cat is alive before we open the box?
109 |
110 | **GT**: yes
111 |
112 | **Required capabilities**: Recognition, spatial awareness, knowledge
113 |
114 | ---
115 |
116 | 
117 |
118 | **Q**: From location A to location B, is it faster to go east or west?
119 |
120 | **GT**: east
121 |
122 | **Required capabilities**: Recognition, spatial awareness, knowledge
123 |
124 | ---
125 |
126 | 
127 |
128 | **Q**: Neglecting air buoyancy (vacuum), which side will go down, iron or cotton?
129 |
130 | **GT**: iron
131 |
132 | **Required capabilities**: Recognition, OCR, spatial awareness, knowledge
133 |
134 | ---
135 |
136 | 
137 |
138 | **Q**: How many dwarfs are there near Snow White in the image?
139 |
140 | **GT**: 6
141 |
142 | **Required capabilities**: Recognition, spatial awareness
143 |
144 |
145 | ## Citation
146 | ```
147 | @article{yu2024mmvetv2,
148 | title={MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models for Integrated Capabilities},
149 | author={Weihao Yu and Zhengyuan Yang and Lingfeng Ren and Linjie Li and Jianfeng Wang and Kevin Lin and Chung-Ching Lin and Zicheng Liu and Lijuan Wang and Xinchao Wang},
150 | journal={arXiv preprint arXiv:2408.00765},
151 | year={2024}
152 | }
153 | ```
--------------------------------------------------------------------------------
/v2/inference/claude.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import os
4 | import base64
5 | import requests
6 | import argparse
7 | import anthropic
8 |
9 |
10 | # Function to encode the image
11 | def encode_image(image_path):
12 | with open(image_path, "rb") as image_file:
13 | return base64.b64encode(image_file.read()).decode('utf-8')
14 |
15 |
16 | class Claude:
17 | def __init__(self, api_key,
18 | model="claude-3-5-sonnet-20240620", temperature=0.0,
19 | max_tokens=512, system=None):
20 | self.model = model
21 | self.client = anthropic.Anthropic(
22 | api_key=api_key,
23 | )
24 | self.system = system
25 | self.temperature = temperature
26 | self.max_tokens = max_tokens
27 |
28 | def get_response(self, image_folder, prompt="What's in this image?"):
29 | messages = []
30 | content = []
31 | queries = prompt.split("
")
32 | img_num = 0
33 | for query in queries:
34 | query = query.strip()
35 | if query == "":
36 | continue
37 | if query.endswith((".jpg", ".png", ".jpeg")):
38 | image_path = os.path.join(image_folder, query)
39 | base64_image = encode_image(image_path)
40 | image_format = "png" if image_path.endswith('.png') else "jpeg"
41 | content.append(
42 | {
43 | "type": "image",
44 | "source": {
45 | "type": "base64",
46 | "media_type": f"image/{image_format}",
47 | "data": base64_image,
48 | }
49 | }
50 | )
51 | img_num += 1
52 | else:
53 | content.append(
54 | {
55 | "type": "text",
56 | "text": query
57 | },
58 | )
59 |
60 | messages.append({
61 | "role": "user",
62 | "content": content,
63 | })
64 |
65 | payload = {
66 | "model": self.model,
67 | "messages": messages,
68 | "max_tokens": self.max_tokens,
69 | "temperature": self.temperature,
70 | }
71 |
72 | if self.system:
73 | payload["system"] = self.system
74 |
75 | response = self.client.messages.create(**payload)
76 | response_text = response.content[0].text
77 | return response_text
78 |
79 |
80 | def arg_parser():
81 | parser = argparse.ArgumentParser()
82 | parser.add_argument(
83 | "--mmvetv2_path",
84 | type=str,
85 | default="/path/to/mm-vet-v2",
86 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here",
87 | )
88 | parser.add_argument(
89 | "--result_path",
90 | type=str,
91 | default="results",
92 | )
93 | parser.add_argument(
94 | "--anthropic_api_key", type=str, default=None,
95 | help="refer to https://platform.openai.com/docs/quickstart?context=python"
96 | )
97 | parser.add_argument(
98 | "--model_name",
99 | type=str,
100 | default="claude-3-5-sonnet-20240620",
101 | help="Claude model name",
102 | )
103 | args = parser.parse_args()
104 | return args
105 |
106 |
107 | if __name__ == "__main__":
108 | args = arg_parser()
109 | model_name = args.model_name
110 | if os.path.exists(args.result_path) is False:
111 | os.makedirs(args.result_path)
112 | results_path = os.path.join(args.result_path, f"{model_name}.json")
113 | image_folder = os.path.join(args.mmvetv2_path, "images")
114 | meta_data = os.path.join(args.mmvetv2_path, "mm-vet-v2.json")
115 |
116 |
117 | if args.anthropic_api_key:
118 | ANTHROPIC_API_KEY = args.anthropic_api_key
119 | else:
120 | ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
121 |
122 | if ANTHROPIC_API_KEY is None:
123 | raise ValueError("Please set the ANTHROPIC_API_KEY environment variable or pass it as an argument")
124 |
125 | claude = Claude(ANTHROPIC_API_KEY, model=model_name)
126 |
127 | if os.path.exists(results_path):
128 | with open(results_path, "r") as f:
129 | results = json.load(f)
130 | else:
131 | results = {}
132 |
133 | with open(meta_data, "r") as f:
134 | data = json.load(f)
135 |
136 | for id in data:
137 | if id in results:
138 | continue
139 | prompt = data[id]["question"].strip()
140 | print(id)
141 | print(f"Prompt: {prompt}")
142 | response = claude.get_response(image_folder, prompt)
143 | print(f"Response: {response}")
144 | results[id] = response
145 | with open(results_path, "w") as f:
146 | json.dump(results, f, indent=4)
147 |
--------------------------------------------------------------------------------
/v2/inference/cogagent.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a demo for using CogAgent and CogVLM in CLI
3 | Make sure you have installed the vicuna-7b-v1.5 tokenizer (https://huggingface.co/lmsys/vicuna-7b-v1.5); the full vicuna-7b-v1.5 LLM checkpoint is not required.
4 | In this demo, we use the 'chat' template; you can replace it with others such as 'vqa'.
5 | We strongly suggest using a GPU with bfloat16 support; otherwise, inference will be slow.
6 | Note that only one picture can be processed per conversation, which means you cannot replace or insert another picture during the conversation.
7 | """
8 |
9 | import argparse
10 | import torch
11 | import json
12 | import os
13 | from PIL import Image
14 | from transformers import AutoModelForCausalLM, LlamaTokenizer
15 | import pandas as pd
16 | from accelerate import (
17 | init_empty_weights,
18 | infer_auto_device_map,
19 | load_checkpoint_and_dispatch,
20 | )
21 | from utils import evaluate_on_mmvetv2, process_images_for_question
22 |
23 |
24 | class CogAgent:
25 | def __init__(
26 | self,
27 | model_name="THUDM/cogagent-chat-hf",
28 | tokenizer_name="",
29 | image_first=False,
30 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.",
31 | chat_format=True,
32 | ):
33 | self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
34 |
35 | self.tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name)
36 | if args.bf16:
37 | self.torch_type = torch.bfloat16
38 | else:
39 | self.torch_type = torch.float16
40 |
41 | print(
42 | "========Use torch type as:{} with device:{}========\n\n".format(
43 | self.torch_type, self.DEVICE
44 | )
45 | )
46 | # tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
47 | with init_empty_weights():
48 | model = AutoModelForCausalLM.from_pretrained(
49 | model_name,
50 | torch_dtype=self.torch_type,
51 | low_cpu_mem_usage=True,
52 | trust_remote_code=True,
53 | )
54 | device_map = infer_auto_device_map(
55 | model,
56 | max_memory={0: "20GiB", 1: "20GiB"},
57 | no_split_module_classes=["CogAgentDecoderLayer"],
58 | )
59 | path = "~/.cache/huggingface/hub/models--THUDM--cogagent-chat-hf/snapshots/balabala" # typical, '~/.cache/huggingface/hub/models--THUDM--cogagent-chat-hf/snapshots/balabala'
60 | model = load_checkpoint_and_dispatch(
61 | model,
62 | path,
63 | device_map=device_map,
64 | )
65 | self.model = model.eval()
66 | self.system_message = system_message
67 | self.chat_format = chat_format
68 |
69 | def get_response(self, image_folder, prompt="What's in this image?") -> str:
70 | images = []
71 | text_queries = []
72 | queries = prompt.split("
")
73 | for query in queries:
74 | query = query.strip()
75 | if query.endswith((".jpg", ".png", ".jpeg")):
76 | images.append(os.path.join(image_folder, query))
77 | text_queries.append("")
78 | else:
79 | text_queries.append(query)
80 | text_query = "".join(text_queries)
81 | image = process_images_for_question(images).convert("RGB")
82 | input_by_model = self.model.build_conversation_input_ids(
83 | self.tokenizer, query=text_query, history=None, images=[image]
84 | )
85 | inputs = {
86 | "input_ids": input_by_model["input_ids"].unsqueeze(0).to(self.DEVICE),
87 | "token_type_ids": input_by_model["token_type_ids"]
88 | .unsqueeze(0)
89 | .to(self.DEVICE),
90 | "attention_mask": input_by_model["attention_mask"]
91 | .unsqueeze(0)
92 | .to(self.DEVICE),
93 | "images": [
94 | [input_by_model["images"][0].to(self.DEVICE).to(self.torch_type)]
95 | ],
96 | }
97 | if "cross_images" in input_by_model and input_by_model["cross_images"]:
98 | inputs["cross_images"] = [
99 | [input_by_model["cross_images"][0].to(self.DEVICE).to(self.torch_type)]
100 | ]
101 |
102 | # add any transformers params here.
103 | gen_kwargs = {"max_length": 2048, "temperature": 0.9, "do_sample": False}
104 | with torch.no_grad():
105 | outputs = self.model.generate(**inputs, **gen_kwargs)
106 | outputs = outputs[:, inputs["input_ids"].shape[1] :]
107 | response = self.tokenizer.decode(outputs[0])
108 | response = response.split("</s>")[0]
109 | output_text = response
110 | return output_text
111 |
112 |
113 | def arg_parser():
114 | parser = argparse.ArgumentParser()
115 | parser.add_argument(
116 | "--quant", choices=[4], type=int, default=None, help="quantization bits"
117 | )
118 | parser.add_argument(
119 | "--model_name",
120 | type=str,
121 | default="THUDM/cogagent-chat-hf",
122 | help="pretrained ckpt",
123 | )
124 | parser.add_argument(
125 | "--local_tokenizer",
126 | type=str,
127 | default="lmsys/vicuna-7b-v1.5",
128 | help="tokenizer path",
129 | )
130 | parser.add_argument("--fp16", action="store_true")
131 | parser.add_argument("--bf16", action="store_true")
132 | parser.add_argument(
133 | "--mmvetv2_path",
134 | type=str,
135 | default="/path/to/mm-vet-v2",
136 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here",
137 | )
138 | parser.add_argument(
139 | "--result_path",
140 | type=str,
141 | default="results",
142 | )
143 | parser.add_argument(
144 | "--image_first",
145 | action="store_true",
146 | help="whether text",
147 | )
148 | parser.add_argument(
149 | "--chat_format",
150 | action="store_true",
151 | help="whether to use chat format",
152 | )
153 | args = parser.parse_args()
154 | return args
155 |
156 |
157 | # path = "/home/abc/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/e29dc3ba206d524bf8efbfc60d80fc4556ab0e3c"
158 | if __name__ == "__main__":
159 | args = arg_parser()
160 |
161 | model = CogAgent(
162 | args.model_name, args.local_tokenizer, image_first=args.image_first
163 | )
164 | if args.image_first:
165 | args.model_name = args.model_name + "-image-first"
166 | if args.chat_format:
167 | args.model_name = args.model_name + "-chat-format"
168 | print(args)
169 | evaluate_on_mmvetv2(args, model)
170 |
--------------------------------------------------------------------------------
/v2/inference/cogvlm.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a demo for using CogAgent and CogVLM in CLI
3 | Make sure you have installed the vicuna-7b-v1.5 tokenizer (https://huggingface.co/lmsys/vicuna-7b-v1.5); the full vicuna-7b-v1.5 LLM checkpoint is not required.
4 | In this demo, we use the 'chat' template; you can replace it with others such as 'vqa'.
5 | We strongly suggest using a GPU with bfloat16 support; otherwise, inference will be slow.
6 | Note that only one picture can be processed per conversation, which means you cannot replace or insert another picture during the conversation.
7 | """
8 |
9 | import argparse
10 | import torch
11 | import json
12 | import os
13 | from PIL import Image
14 | from transformers import AutoModelForCausalLM, LlamaTokenizer
15 | import pandas as pd
16 |
17 | from accelerate import (
18 | init_empty_weights,
19 | infer_auto_device_map,
20 | load_checkpoint_and_dispatch,
21 | )
22 | from utils import evaluate_on_mmvetv2, process_images_for_question
23 |
24 |
25 | class CogVLM:
26 | def __init__(
27 | self,
28 | model_name="THUDM/cogvlm-chat-hf",
29 | tokenizer_name="",
30 | image_first=False,
31 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.",
32 | chat_format=True,
33 | ):
34 | self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
35 |
36 | self.tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name)
37 | if args.bf16:
38 | self.torch_type = torch.bfloat16
39 | else:
40 | self.torch_type = torch.float16
41 |
42 | print(
43 | "========Use torch type as:{} with device:{}========\n\n".format(
44 | self.torch_type, self.DEVICE
45 | )
46 | )
47 |
48 | with init_empty_weights():
49 | model = AutoModelForCausalLM.from_pretrained(
50 | model_name,
51 | torch_dtype=self.torch_type,
52 | low_cpu_mem_usage=True,
53 | trust_remote_code=True,
54 | )
55 | device_map = infer_auto_device_map(
56 | model,
57 | max_memory={0: "20GiB", 1: "20GiB"},
58 | no_split_module_classes=["CogVLMDecoderLayer", "TransformerLayer"],
59 | )
60 | path = (
61 | "~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala"
62 | )
63 |
64 | model = load_checkpoint_and_dispatch(
65 | model,
66 | path, # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
67 | device_map=device_map,
68 | )
69 | self.model = model.eval()
70 | self.system_message = system_message
71 | self.chat_format = chat_format
72 |
73 | def get_response(self, image_folder, prompt="What's in this image?") -> str:
74 | images = []
75 | text_queries = []
76 | queries = prompt.split("
")
77 | for query in queries:
78 | query = query.strip()
79 | if query.endswith((".jpg", ".png", ".jpeg")):
80 | images.append(os.path.join(image_folder, query))
81 | text_queries.append("")
82 | else:
83 | text_queries.append(query)
84 | text_query = "".join(text_queries)
85 | image = process_images_for_question(images).convert("RGB")
86 | input_by_model = self.model.build_conversation_input_ids(
87 | self.tokenizer, query=text_query, history=None, images=[image]
88 | )
89 | inputs = {
90 | "input_ids": input_by_model["input_ids"].unsqueeze(0).to(self.DEVICE),
91 | "token_type_ids": input_by_model["token_type_ids"]
92 | .unsqueeze(0)
93 | .to(self.DEVICE),
94 | "attention_mask": input_by_model["attention_mask"]
95 | .unsqueeze(0)
96 | .to(self.DEVICE),
97 | "images": (
98 | [[input_by_model["images"][0].to(self.DEVICE).to(self.torch_type)]]
99 | if image is not None
100 | else None
101 | ),
102 | }
103 | if "cross_images" in input_by_model and input_by_model["cross_images"]:
104 | inputs["cross_images"] = [
105 | [input_by_model["cross_images"][0].to(self.DEVICE).to(self.torch_type)]
106 | ]
107 |
108 | # add any transformers params here.
109 | gen_kwargs = {"max_length": 2048, "do_sample": False} # "temperature": 0.9
110 | with torch.no_grad():
111 | outputs = self.model.generate(**inputs, **gen_kwargs)
112 | outputs = outputs[:, inputs["input_ids"].shape[1] :]
113 | response = self.tokenizer.decode(outputs[0])
114 | response = response.split("</s>")[0].strip()
115 | output_text = response
116 | return output_text
117 |
118 |
119 | def arg_parser():
120 | parser = argparse.ArgumentParser()
121 | parser.add_argument(
122 | "--quant", choices=[4], type=int, default=None, help="quantization bits"
123 | )
124 | parser.add_argument(
125 | "--model_name",
126 | type=str,
127 | default="THUDM/cogvlm-chat-hf",
128 | help="pretrained ckpt",
129 | )
130 | parser.add_argument(
131 | "--local_tokenizer",
132 | type=str,
133 | default="lmsys/vicuna-7b-v1.5",
134 | help="tokenizer path",
135 | )
136 | parser.add_argument("--fp16", action="store_true")
137 | parser.add_argument("--bf16", action="store_true")
138 | parser.add_argument(
139 | "--mmvetv2_path",
140 | type=str,
141 | default="/path/to/mm-vet-v2",
142 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here",
143 | )
144 | parser.add_argument(
145 | "--result_path",
146 | type=str,
147 | default="results",
148 | )
149 | parser.add_argument(
150 | "--image_first",
151 | action="store_true",
152 | help="whether text",
153 | )
154 | parser.add_argument(
155 | "--chat_format",
156 | action="store_true",
157 | help="whether to use chat format",
158 | )
159 | args = parser.parse_args()
160 | return args
161 |
162 |
163 | if __name__ == "__main__":
164 | args = arg_parser()
165 |
166 | model = CogVLM(args.model_name, args.local_tokenizer, image_first=args.image_first)
167 | if args.image_first:
168 | args.model_name = args.model_name + "-image-first"
169 | if args.chat_format:
170 | args.model_name = args.model_name + "-chat-format"
171 | print(args)
172 | evaluate_on_mmvetv2(args, model)
173 |
--------------------------------------------------------------------------------
/v2/inference/emu2.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import torch
3 | from transformers import AutoModelForCausalLM, AutoTokenizer
4 | from accelerate import (
5 | init_empty_weights,
6 | infer_auto_device_map,
7 | load_checkpoint_and_dispatch,
8 | )
9 | import os
10 | import argparse
11 | from utils import evaluate_on_mmvetv2
12 |
13 |
14 | class Emu2:
15 | def __init__(
16 | self,
17 | model_name="BAAI/Emu2-Chat",
18 | image_first=False,
19 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.",
20 | chat_format=True,
21 | ):
22 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) # "BAAI/Emu2-Chat"
23 | # self.model = AutoModelForCausalLM.from_pretrained(
24 | # model_name,
25 | # torch_dtype=torch.bfloat16,
26 | # low_cpu_mem_usage=True,
27 | # trust_remote_code=True).to('cuda').eval()
28 | with init_empty_weights():
29 | model = AutoModelForCausalLM.from_pretrained(
30 | model_name,
31 | torch_dtype=torch.bfloat16,
32 | low_cpu_mem_usage=True,
33 | trust_remote_code=True,
34 | )
35 | device_map = infer_auto_device_map(
36 | model,
37 | max_memory={0: "16GIB", 1: "20GIB", 2: "20GIB", 3: "20GIB"},
38 | no_split_module_classes=["Block", "LlamaDecoderLayer"],
39 | )
40 | device_map["model.decoder.lm.lm_head"] = 0
41 | self.image_first = image_first
42 | self.model = load_checkpoint_and_dispatch(
43 | model,
44 | "/home/abc/.cache/huggingface/hub/models--BAAI--Emu2-Chat/snapshots/20ea30b04f8fee599cf97535e655c200df728501",
45 | device_map=device_map,
46 | ).eval()
47 | self.system_message = system_message
48 | self.chat_format = chat_format
49 |
50 | def get_response(self, image_folder, prompt="What's in this image?") -> str:
51 | images = []
52 | text_queries = []
53 | queries = prompt.split("
")
54 | for query in queries:
55 | query = query.strip()
56 | if query.endswith((".jpg", ".png", ".jpeg")):
57 | image_path = os.path.join(image_folder, query)
58 | images.append(Image.open(image_path).convert("RGB"))
59 | text_queries.append("[]")
60 | else:
61 | text_queries.append(query)
62 |
63 | if self.image_first:
64 | for i in range(1, len(text_queries)):
65 | if text_queries[i] == "[]" and (
66 | text_queries[i - 1] != "[]"
67 | ):
68 | tmp = text_queries[i - 1]
69 | text_queries[i - 1] = text_queries[i]
70 | text_queries[i] = tmp
71 | text_query = "".join(text_queries)
72 | if self.chat_format:
73 | text_query = f"{self.system_message} [USER]: {text_query} [ASSISTANT]:"
74 | print(text_query)
75 | inputs = self.model.build_input_ids(
76 | text=[text_query], tokenizer=self.tokenizer, image=images
77 | )
78 |
79 | with torch.no_grad():
80 | outputs = self.model.generate(
81 | input_ids=inputs["input_ids"],
82 | attention_mask=inputs["attention_mask"],
83 | image=inputs["image"].to(torch.bfloat16),
84 | max_new_tokens=512,
85 | length_penalty=-1,
86 | )
87 | output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
88 | return output_text
89 |
90 |
91 | def arg_parser():
92 | parser = argparse.ArgumentParser()
93 | parser.add_argument(
94 | "--mmvetv2_path",
95 | type=str,
96 | default="/path/to/mm-vet-v2",
97 |         help="Download mm-vet-v2.zip, run `unzip mm-vet-v2.zip`, and set the path here",
98 | )
99 | parser.add_argument(
100 | "--result_path",
101 | type=str,
102 | default="results",
103 | )
104 | parser.add_argument(
105 | "--model_name",
106 | type=str,
107 | default="BAAI/Emu2-Chat",
108 | help="pretrained ckpt",
109 | )
110 | parser.add_argument(
111 | "--image_first",
112 | action="store_true",
113 |         help="place each image before its corresponding text in the prompt",
114 | )
115 | parser.add_argument(
116 | "--chat_format",
117 | action="store_true",
118 | help="whether to use chat format",
119 | )
120 | args = parser.parse_args()
121 | return args
122 |
123 |
124 | if __name__ == "__main__":
125 | args = arg_parser()
126 |
127 | model = Emu2(args.model_name, image_first=args.image_first)
128 | if args.image_first:
129 | args.model_name = args.model_name + "-image-first"
130 | if args.chat_format:
131 | args.model_name = args.model_name + "-chat-format"
132 | evaluate_on_mmvetv2(args, model)
133 |
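134 | # Note: the snapshot path given to load_checkpoint_and_dispatch above is machine-specific;
135 | # point it at your local BAAI/Emu2-Chat download. Example invocation, run from v2/inference
136 | # (paths are placeholders):
137 | #   python emu2.py --mmvetv2_path /path/to/mm-vet-v2 --image_first --chat_format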
--------------------------------------------------------------------------------
/v2/inference/gemini.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from pathlib import Path
4 | import argparse
5 | import json
6 | import google.generativeai as genai
7 | from utils import evaluate_on_mmvetv2
8 |
9 |
10 | class Gemini:
11 | def __init__(self, model="gemini-1.5-pro"):
12 | self.model = genai.GenerativeModel(model)
13 |
14 | def get_response(self, image_folder, prompt="What's in this image?") -> str:
15 |
16 | content = []
17 |         queries = prompt.split("<br>")
18 | img_num = 0
19 | for query in queries:
20 | if query.endswith((".jpg", ".png", ".jpeg")):
21 | image_path = Path(os.path.join(image_folder, query))
22 | image = {
23 | 'mime_type': f'image/{image_path.suffix[1:].replace("jpg", "jpeg")}',
24 | 'data': image_path.read_bytes()
25 | }
26 | img_num += 1
27 | content.append(image)
28 | else:
29 | content.append(query)
30 |
31 | if img_num > 16:
32 | return ""
33 | # Query the model
34 | text = ""
35 | while len(text) < 1:
36 | try:
37 | response = self.model.generate_content(
38 | content
39 | )
40 | try:
41 | text = response.text
42 | except:
43 | text = " "
44 | except Exception as error:
45 | print(error)
46 | print('Sleeping for 10 seconds')
47 | time.sleep(10)
48 | return text.strip()
49 |
50 |
51 | def arg_parser():
52 | parser = argparse.ArgumentParser()
53 | parser.add_argument(
54 | "--mmvetv2_path",
55 | type=str,
56 |         default="/path/to/mm-vet-v2",
57 |         help="Download mm-vet-v2.zip, run `unzip mm-vet-v2.zip`, and set the path here",
58 | )
59 | parser.add_argument(
60 | "--result_path",
61 | type=str,
62 | default="results",
63 | )
64 | parser.add_argument(
65 | "--google_api_key", type=str, default=None,
66 | help="refer to https://ai.google.dev/tutorials/python_quickstart"
67 | )
68 | parser.add_argument(
69 | "--model_name",
70 | type=str,
71 | default="gemini-1.5-pro",
72 | help="Gemini model name",
73 | )
74 | args = parser.parse_args()
75 | return args
76 |
77 |
78 | if __name__ == "__main__":
79 | args = arg_parser()
80 |
81 | if args.google_api_key:
82 | GOOGLE_API_KEY = args.google_api_key
83 | else:
84 | GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
85 |
86 | if GOOGLE_API_KEY is None:
87 | raise ValueError("Please set the GOOGLE_API_KEY environment variable or pass it as an argument")
88 |
89 | genai.configure(api_key=GOOGLE_API_KEY)
90 | model = Gemini(model=args.model_name)
91 |
92 | evaluate_on_mmvetv2(args, model)
93 |
94 |
95 |
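96 | # Example invocation, run from v2/inference (key and path are placeholders):
97 | #   GOOGLE_API_KEY=... python gemini.py --mmvetv2_path /path/to/mm-vet-v2 --model_name gemini-1.5-pro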
--------------------------------------------------------------------------------
/v2/inference/gpt4.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import requests
4 | import argparse
5 | from utils import encode_image, evaluate_on_mmvetv2
6 |
7 |
8 |
9 |
10 | class GPT4:
11 | def __init__(self, api_key, model="gpt-4o-2024-05-13", image_detail="auto",
12 | system_text="You are a helpful assistant. Generate a short and concise response to the following image text pair."):
13 | self.api_key = api_key
14 | self.model = model
15 | self.image_detail = image_detail
16 | self.system_text = system_text
17 | self.headers = {
18 | "Content-Type": "application/json",
19 | "Authorization": f"Bearer {self.api_key}"
20 | }
21 | self.url = "https://api.openai.com/v1/chat/completions"
22 |
23 | def get_response(self, image_folder, prompt="What's in this image?"):
24 | messages = []
25 |         if self.system_text is not None and self.system_text != "":
26 | messages.append({
27 | "role": "system",
28 | "content": [
29 | {
30 | "type": "text",
31 | "text": self.system_text,
32 | },
33 | ]
34 | })
35 |
36 | content = []
37 |         queries = prompt.split("<br>")
38 | img_num = 0
39 | for query in queries:
40 | query = query.strip()
41 | if query.endswith((".jpg", ".png", ".jpeg")):
42 | image_path = os.path.join(image_folder, query)
43 | base64_image = encode_image(image_path)
44 | image_format = "data:image/png;base64" if image_path.endswith('.png') else "data:image/jpeg;base64"
45 | content.append(
46 | {
47 | "type": "image_url",
48 | "image_url": {
49 | "url": f"{image_format},{base64_image}",
50 | "detail": self.image_detail,
51 | }
52 | }
53 | )
54 | img_num += 1
55 | else:
56 | content.append(
57 | {
58 | "type": "text",
59 | "text": query
60 | },
61 | )
62 |
63 | messages.append({
64 | "role": "user",
65 | "content": content,
66 | })
67 | payload = {
68 | "model": self.model,
69 | "messages": messages,
70 | "max_tokens": 500,
71 | }
72 |
73 | response_text, retry, response_json, regular_time = '', 0, None, 30
74 | while len(response_text) < 1:
75 | retry += 1
76 | time.sleep(1)
77 | try:
78 | response = requests.post(self.url, headers=self.headers, json=payload)
79 | response_json = response.json()
80 | # print(response_json)
81 | except Exception as e:
82 | print(e)
83 | time.sleep(regular_time)
84 | continue
85 | if response.status_code != 200:
86 | print(response.headers,response.content)
87 | print(image_path)
88 | print(f"The response status code for is {response.status_code} (Not OK)")
89 | time.sleep(regular_time)
90 | continue
91 | if 'choices' not in response_json:
92 | time.sleep(regular_time)
93 | continue
94 | response_text = response_json["choices"][0]["message"]["content"]
95 | return response_json["choices"][0]["message"]["content"]
96 |
97 |
98 | def arg_parser():
99 | parser = argparse.ArgumentParser()
100 | parser.add_argument(
101 | "--mmvetv2_path",
102 | type=str,
103 | default="/path/to/mm-vet-v2",
104 |         help="Download mm-vet-v2.zip, run `unzip mm-vet-v2.zip`, and set the path here",
105 | )
106 | parser.add_argument(
107 | "--result_path",
108 | type=str,
109 | default="results",
110 | )
111 | parser.add_argument(
112 | "--openai_api_key", type=str, default=None,
113 | help="refer to https://platform.openai.com/docs/quickstart?context=python"
114 | )
115 | parser.add_argument(
116 | "--model_name",
117 | type=str,
118 | default="gpt-4o-2024-05-13",
119 | help="GPT model name",
120 | )
121 | parser.add_argument(
122 | "--image_detail",
123 | type=str,
124 | default="auto",
125 | help="Refer to https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding",
126 | )
127 | args = parser.parse_args()
128 | return args
129 |
130 |
131 | if __name__ == "__main__":
132 | args = arg_parser()
133 |
134 | if args.openai_api_key:
135 | OPENAI_API_KEY = args.openai_api_key
136 | else:
137 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
138 |
139 | if OPENAI_API_KEY is None:
140 | raise ValueError("Please set the OPENAI_API_KEY environment variable or pass it as an argument")
141 |
142 | model = GPT4(OPENAI_API_KEY, model=args.model_name, image_detail=args.image_detail)
143 | args.model_name = f"{args.model_name}_detail-{args.image_detail}"
144 |
145 | evaluate_on_mmvetv2(args, model)
146 |
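147 | # Example invocation, run from v2/inference (key and path are placeholders):
148 | #   python gpt4.py --mmvetv2_path /path/to/mm-vet-v2 --openai_api_key sk-... \
149 | #       --model_name gpt-4o-2024-05-13 --image_detail high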
--------------------------------------------------------------------------------
/v2/inference/internvl.py:
--------------------------------------------------------------------------------
1 | import random
2 | import torch
3 | from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor
4 | import os
5 | from accelerate import init_empty_weights, infer_auto_device_map
6 | import argparse
7 | from utils import evaluate_on_mmvetv2, process_images_for_question
8 |
9 |
10 | def disable_torch_init():
11 | """
12 | Disable the redundant torch default initialization to accelerate model creation.
13 | """
14 | import torch
15 |
16 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
17 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
18 |
19 |
20 | class Internvl:
21 | def __init__(
22 | self,
23 | model_name="OpenGVLab/InternVL-Chat-V1-2", # OpenGVLab/InternVL-Chat-V1-2
24 | image_first=False,
25 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.",
26 | chat_format=True,
27 | ):
28 | random.seed(args.seed)
29 | if args.bf16:
30 | self.torch_type = torch.bfloat16
31 | else:
32 | self.torch_type = torch.float16
33 | self.model = AutoModel.from_pretrained(
34 | model_name,
35 | torch_dtype=self.torch_type,
36 | low_cpu_mem_usage=True,
37 | trust_remote_code=True,
38 | device_map="auto",
39 | ).eval()
40 |
41 |         self.model_name = model_name  # stored so get_response can load the matching image processor
42 |         self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
43 |         self.temperature = 0.0
44 |         self.system_message = system_message
45 |         self.chat_format = chat_format
46 | def get_response(self, image_folder, prompt="What's in this image?") -> str:
47 | images = []
48 | text_queries = []
49 |         queries = prompt.split("<br>")
50 | for query in queries:
51 | query = query.strip()
52 | if query.endswith((".jpg", ".png", ".jpeg")):
53 | images.append(os.path.join(image_folder, query))
54 | text_queries.append("")
55 | else:
56 | text_queries.append(query)
57 | text_query = "".join(text_queries)
58 | image = process_images_for_question(images).convert("RGB")
59 | image = image.resize((448, 448))
60 | image_processor = CLIPImageProcessor.from_pretrained(self.model_name)
61 |
62 | pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
63 | pixel_values = pixel_values.to(self.torch_type).cuda()
64 |
65 | generation_config = dict(
66 | num_beams=1,
67 | max_new_tokens=1024,
68 | do_sample=True if self.temperature > 0 else False,
69 | temperature=self.temperature,
70 | length_penalty=1.0,
71 | repetition_penalty=1.2,
72 | )
73 |
74 |         response = self.model.chat(
75 | self.tokenizer, pixel_values, text_query, generation_config
76 | )
77 | return response
78 |
79 |
80 | def arg_parser():
81 | parser = argparse.ArgumentParser()
82 | parser.add_argument(
83 | "--mmvetv2_path",
84 | type=str,
85 | default="/path/to/mm-vet-v2",
86 |         help="Download mm-vet-v2.zip, run `unzip mm-vet-v2.zip`, and set the path here",
87 | )
88 | parser.add_argument(
89 | "--result_path",
90 | type=str,
91 | default="results",
92 | )
93 | parser.add_argument("--fp16", action="store_true")
94 | parser.add_argument("--bf16", action="store_true")
95 | parser.add_argument(
96 | "--model_name",
97 | type=str,
98 | default="OpenGVLab/InternVL-Chat-V1-2",
99 | help="pretrained ckpt",
100 | )
101 | parser.add_argument(
102 | "--image_first",
103 | action="store_true",
104 |         help="place each image before its corresponding text in the prompt",
105 | )
106 | parser.add_argument(
107 | "--chat_format",
108 | action="store_true",
109 | help="whether to use chat format",
110 | )
111 | parser.add_argument("--seed", type=int, default=0)
112 | args = parser.parse_args()
113 | return args
114 |
115 |
116 | if __name__ == "__main__":
117 | args = arg_parser()
118 |
119 | model = Internvl(args.model_name, image_first=args.image_first)
120 | # model = None
121 | if args.image_first:
122 | args.model_name = args.model_name + "-image-first"
123 | if args.chat_format:
124 | args.model_name = args.model_name + "-chat-format"
125 | evaluate_on_mmvetv2(args, model)
126 |
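127 | # Note: all images referenced in a question are merged into a single collage by
128 | # process_images_for_question and resized to 448x448 before being passed to the model.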
--------------------------------------------------------------------------------
/v2/inference/internvl2.py:
--------------------------------------------------------------------------------
1 | import math
2 | import random
3 | import torch
4 | from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor
5 | import os
6 | from accelerate import init_empty_weights, infer_auto_device_map
7 | import argparse
8 | from utils import evaluate_on_mmvetv2, process_images_for_question
9 | import torchvision.transforms as T
10 | from PIL import Image, ImageDraw
11 | from torchvision.transforms.functional import InterpolationMode
12 |
13 | IMAGENET_MEAN = (0.485, 0.456, 0.406)
14 | IMAGENET_STD = (0.229, 0.224, 0.225)
15 |
16 |
17 | def split_model(model_name):
18 | device_map = {}
19 | world_size = torch.cuda.device_count()
20 | num_layers = {
21 | "InternVL2-1B": 24,
22 | "InternVL2-2B": 24,
23 | "InternVL2-4B": 32,
24 | "InternVL2-8B": 32,
25 | "InternVL2-26B": 48,
26 | "InternVL2-40B": 60,
27 | "InternVL2-Llama3-76B": 80,
28 | }[model_name]
29 | # Since the first GPU will be used for ViT, treat it as half a GPU.
30 | num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
31 | num_layers_per_gpu = [num_layers_per_gpu] * world_size
32 | num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
33 | layer_cnt = 0
34 | for i, num_layer in enumerate(num_layers_per_gpu):
35 | for j in range(num_layer):
36 | device_map[f"language_model.model.layers.{layer_cnt}"] = i
37 | layer_cnt += 1
38 | device_map["vision_model"] = 0
39 | device_map["mlp1"] = 0
40 | device_map["language_model.model.tok_embeddings"] = 0
41 | device_map["language_model.model.embed_tokens"] = 0
42 | device_map["language_model.output"] = 0
43 | device_map["language_model.model.norm"] = 0
44 | device_map["language_model.lm_head"] = 0
45 | device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
46 |
47 | return device_map
48 |
49 |
50 | def build_transform(input_size):
51 | MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
52 | transform = T.Compose(
53 | [
54 | T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
55 | T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
56 | T.ToTensor(),
57 | T.Normalize(mean=MEAN, std=STD),
58 | ]
59 | )
60 | return transform
61 |
62 |
63 | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
64 | best_ratio_diff = float("inf")
65 | best_ratio = (1, 1)
66 | area = width * height
67 | for ratio in target_ratios:
68 | target_aspect_ratio = ratio[0] / ratio[1]
69 | ratio_diff = abs(aspect_ratio - target_aspect_ratio)
70 | if ratio_diff < best_ratio_diff:
71 | best_ratio_diff = ratio_diff
72 | best_ratio = ratio
73 | elif ratio_diff == best_ratio_diff:
74 | if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
75 | best_ratio = ratio
76 | return best_ratio
77 |
78 |
79 | def dynamic_preprocess(
80 | image, min_num=1, max_num=6, image_size=448, use_thumbnail=False
81 | ):
82 | orig_width, orig_height = image.size
83 | aspect_ratio = orig_width / orig_height
84 |
85 | # calculate the existing image aspect ratio
86 | target_ratios = set(
87 | (i, j)
88 | for n in range(min_num, max_num + 1)
89 | for i in range(1, n + 1)
90 | for j in range(1, n + 1)
91 | if i * j <= max_num and i * j >= min_num
92 | )
93 | target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
94 |
95 | # find the closest aspect ratio to the target
96 | target_aspect_ratio = find_closest_aspect_ratio(
97 | aspect_ratio, target_ratios, orig_width, orig_height, image_size
98 | )
99 |
100 | # calculate the target width and height
101 | target_width = image_size * target_aspect_ratio[0]
102 | target_height = image_size * target_aspect_ratio[1]
103 | blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
104 |
105 | # resize the image
106 | resized_img = image.resize((target_width, target_height))
107 | processed_images = []
108 | for i in range(blocks):
109 | box = (
110 | (i % (target_width // image_size)) * image_size,
111 | (i // (target_width // image_size)) * image_size,
112 | ((i % (target_width // image_size)) + 1) * image_size,
113 | ((i // (target_width // image_size)) + 1) * image_size,
114 | )
115 | # split the image
116 | split_img = resized_img.crop(box)
117 | processed_images.append(split_img)
118 | assert len(processed_images) == blocks
119 | if use_thumbnail and len(processed_images) != 1:
120 | thumbnail_img = image.resize((image_size, image_size))
121 | processed_images.append(thumbnail_img)
122 | return processed_images
123 |
124 |
125 | def load_image(image_file, input_size=448, max_num=6):
126 | if isinstance(image_file, str):
127 | image = Image.open(image_file).convert("RGB")
128 | else:
129 | image = image_file
130 | transform = build_transform(input_size=input_size)
131 | images = dynamic_preprocess(
132 | image, image_size=input_size, use_thumbnail=True, max_num=max_num
133 | )
134 | pixel_values = [transform(image) for image in images]
135 | pixel_values = torch.stack(pixel_values)
136 | return pixel_values
137 |
138 |
139 | def disable_torch_init():
140 | """
141 | Disable the redundant torch default initialization to accelerate model creation.
142 | """
143 | import torch
144 |
145 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
146 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
147 |
148 |
149 | class Internvl:
150 | def __init__(
151 | self,
152 | model_name="OpenGVLab/InternVL2-40B", # OpenGVLab/InternVL-Chat-V1-5 OpenGVLab/InternVL2-40B OpenGVLab/InternVL2-Llama3-76B
153 | image_first=False,
154 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.",
155 | chat_format=True,
156 | ):
157 | random.seed(args.seed)
158 | if args.bf16:
159 | self.torch_type = torch.bfloat16
160 | else:
161 | self.torch_type = torch.float16
162 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
163 | try:
164 | self.model = AutoModel.from_pretrained(
165 | pretrained_model_name_or_path=model_name,
166 | torch_dtype=self.torch_type,
167 | low_cpu_mem_usage=True,
168 | trust_remote_code=True,
169 | device_map="auto",
170 | ).eval()
171 | except Exception:
172 | device_map = split_model(
173 | model_name.split("/")[-1]
174 | ) # "InternVL2-Llama3-76B"
175 | self.model = AutoModel.from_pretrained(
176 | model_name,
177 | torch_dtype=self.torch_type,
178 | low_cpu_mem_usage=True,
179 | trust_remote_code=True,
180 | device_map=device_map,
181 | ).eval()
182 | self.tokenizer = AutoTokenizer.from_pretrained(
183 | model_name, use_fast=False, trust_remote_code=True
184 | )
185 |
186 | def get_response(self, image_folder, prompt="What's in this image?") -> str:
187 | images = []
188 | text_queries = []
189 |         queries = prompt.split("<br>")
190 | pixel_values = []
191 | num_patches_list = []
192 | for query in queries:
193 | query = query.strip()
194 | if query.endswith((".jpg", ".png", ".jpeg")):
195 | images.append(os.path.join(image_folder, query))
196 | pixel_values.append(
197 | load_image(os.path.join(image_folder, query), max_num=6)
198 | .to(self.torch_type)
199 | .cuda()
200 | )
201 | num_patches_list.append(pixel_values[-1].size(0))
202 | else:
203 | text_queries.append(query)
204 | text_query = "".join(text_queries)
205 | if args.unique:
206 | question = ""
207 | for i in range(len(pixel_values)):
208 | idx = i + 1
209 | question += f"Image-{idx}: \n"
210 | question += text_query
211 | else:
212 |             question = f"<image>\n{text_query}"
213 | pixel_values = torch.cat(pixel_values, dim=0)
214 | generation_config = dict(
215 | num_beams=1,
216 | max_new_tokens=512,
217 | min_new_tokens=1,
218 | do_sample=True if args.temperature > 0 else False,
219 | temperature=args.temperature,
220 | length_penalty=1.0,
221 | repetition_penalty=1.2,
222 | )
223 | try:
224 |             response, history = self.model.chat(
225 | self.tokenizer,
226 | pixel_values,
227 | question,
228 | generation_config,
229 | num_patches_list=num_patches_list if args.unique else None,
230 | history=None,
231 | return_history=True,
232 | )
233 | except Exception as e:
234 | combined_images = process_images_for_question(images).convert("RGB")
235 | pixel_values = (
236 | load_image(combined_images, max_num=6).to(self.torch_type).cuda()
237 | )
238 |             response, history = self.model.chat(
239 | self.tokenizer,
240 | pixel_values,
241 | question,
242 | generation_config,
243 | history=None,
244 | return_history=True,
245 | )
246 | print(f"found error: {e}, combine images to save space")
247 | return response
248 |
249 |
250 | def arg_parser():
251 | parser = argparse.ArgumentParser()
252 | parser.add_argument(
253 | "--mmvetv2_path",
254 | type=str,
255 | default="/path/to/mm-vet-v2",
256 |         help="Download mm-vet-v2.zip, run `unzip mm-vet-v2.zip`, and set the path here",
257 | )
258 | parser.add_argument(
259 | "--result_path",
260 | type=str,
261 | default="results",
262 | )
263 | parser.add_argument("--fp16", action="store_true")
264 | parser.add_argument("--bf16", action="store_true")
265 | parser.add_argument(
266 | "--model_name",
267 | type=str,
268 | default="OpenGVLab/InternVL2-40B",
269 | help="pretrained ckpt",
270 | )
271 | parser.add_argument(
272 | "--image_first",
273 | action="store_true",
274 |         help="place each image before its corresponding text in the prompt",
275 | )
276 | parser.add_argument(
277 | "--chat_format",
278 | action="store_true",
279 | help="whether to use chat format",
280 | )
281 | parser.add_argument("--temperature", type=float, default=0.0)
282 | parser.add_argument("--unique", action="store_true")
283 |
284 | parser.add_argument("--seed", type=int, default=0)
285 | args = parser.parse_args()
286 | return args
287 |
288 |
289 | if __name__ == "__main__":
290 | args = arg_parser()
291 |
292 | model = Internvl(args.model_name, image_first=args.image_first)
293 | if args.image_first:
294 | args.model_name = args.model_name + "-image-first"
295 | if args.chat_format:
296 | args.model_name = args.model_name + "-chat-format"
297 | evaluate_on_mmvetv2(args, model)
298 |
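299 | # Note: with --unique each image keeps its own tile list and is referenced as
300 | # "Image-1:", "Image-2:", ...; otherwise a single <image> placeholder is used. If
301 | # generation fails (e.g. out of memory), the images are merged into one collage and retried.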
--------------------------------------------------------------------------------
/v2/inference/ixc2.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModel, AutoTokenizer
3 | import argparse
4 |
5 | import os
6 | import torch
7 | import torchvision
8 | from PIL import Image
9 | from utils import evaluate_on_mmvetv2, process_images_for_question
10 |
11 |
12 | def auto_configure_device_map(num_gpus):
13 |     # visual_encoder counts as 4 layers
14 |     # internlm_model.model.embed_tokens takes 1 layer
15 |     # norm and lm_head take 1 layer
16 |     # transformer.layers take 32 layers
17 |     # 38 layers in total, spread across num_gpus GPUs
18 | num_trans_layers = 32
19 | per_gpu_layers = 38 / num_gpus
20 |
21 | device_map = {
22 | "vit": 0,
23 | "vision_proj": 0,
24 | "model.tok_embeddings": 0,
25 | "model.norm": num_gpus - 1,
26 | "output": num_gpus - 1,
27 | }
28 |
29 | used = 3
30 | gpu_target = 0
31 | for i in range(num_trans_layers):
32 | if used >= per_gpu_layers:
33 | gpu_target += 1
34 | used = 0
35 | assert gpu_target < num_gpus
36 | device_map[f"model.layers.{i}"] = gpu_target
37 | used += 1
38 |
39 | return device_map
40 |
41 |
42 | def model_gen_single_img(model, text, images, need_bos=True, padding=False):
43 | pt1 = 0
44 | embeds = []
45 | im_mask = []
46 | images = images
47 | images_loc = [0]
48 | for i, pts in enumerate(images_loc + [len(text)]):
49 | subtext = text[pt1:pts]
50 | if need_bos or len(subtext) > 0:
51 | text_embeds = model.encode_text(subtext, add_special_tokens=need_bos)
52 | embeds.append(text_embeds)
53 | im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
54 | need_bos = False
55 | if i < len(images):
56 | try:
57 | image = Image.open(images[i]).convert("RGB")
58 | except:
59 | image = images[i].convert("RGB")
60 | if padding:
61 | image = __padding__(image)
62 | image = model.vis_processor(image).unsqueeze(0).half().cuda()
63 | image_embeds = model.encode_img(image)
64 | embeds.append(image_embeds)
65 | im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
66 | pt1 = pts
67 | embeds = torch.cat(embeds, dim=1)
68 | im_mask = torch.cat(im_mask, dim=1)
69 | im_mask = im_mask.bool()
70 |
71 | outputs = model.generate(
72 | inputs_embeds=embeds,
73 | im_mask=im_mask,
74 | temperature=1.0,
75 | max_new_tokens=4096,
76 | num_beams=3,
77 | do_sample=False,
78 | repetition_penalty=1.0,
79 | )
80 |
81 | output_token = outputs[0]
82 | if output_token[0] == 0 or output_token[0] == 1:
83 | output_token = output_token[1:]
84 | output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
85 | output_text = output_text.split("[UNUSED_TOKEN_145]")[0].strip()
86 | return output_text
87 |
88 |
89 | def model_gen_multi_img(model, text, images, need_bos=True, padding=False):
90 | embeds = []
91 | im_mask = []
92 | images = images
93 | for i, pts in enumerate(text):
94 | text_embeds = model.encode_text(
95 | pts, add_special_tokens=need_bos if i == 0 else False
96 | )
97 | embeds.append(text_embeds)
98 | im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
99 | if i < len(images):
100 | assert os.path.exists(images[i])
101 | try:
102 | image = Image.open(images[i]).convert("RGB")
103 | except:
104 | image = images[i].convert("RGB")
105 | if padding:
106 | image = __padding__(image)
107 | image = model.vis_processor(image).unsqueeze(0).cuda()
108 | image_embeds = model.encode_img(image)
109 | embeds.append(image_embeds)
110 | im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
111 | embeds = torch.cat(embeds, dim=1)
112 | im_mask = torch.cat(im_mask, dim=1)
113 | im_mask = im_mask.bool()
114 | outputs = model.generate(
115 | inputs_embeds=embeds,
116 | im_mask=im_mask,
117 | temperature=1.0,
118 | max_new_tokens=4096,
119 | num_beams=3,
120 | do_sample=False,
121 | repetition_penalty=1.0,
122 | )
123 | output_token = outputs[0]
124 | if output_token[0] == 0 or output_token[0] == 1:
125 | output_token = output_token[1:]
126 | output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
127 | output_text = output_text.split("[UNUSED_TOKEN_145]")[0].strip()
128 | return output_text
129 |
130 |
131 | def __padding__(image):
132 | width, height = image.size
133 | tar = max(width, height)
134 | top_padding = int((tar - height) / 2)
135 | bottom_padding = tar - height - top_padding
136 | left_padding = int((tar - width) / 2)
137 | right_padding = tar - width - left_padding
138 | image = torchvision.transforms.functional.pad(
139 | image, [left_padding, top_padding, right_padding, bottom_padding]
140 | )
141 | return image
142 |
143 |
144 | class InternLM_XComposer2_VL:
145 | def __init__(
146 | self,
147 | model_name="internlm/internlm-xcomposer2-vl-7b",
148 | image_first=False,
149 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.",
150 | chat_format=True,
151 | ):
152 | self.model = AutoModel.from_pretrained(
153 | model_name, trust_remote_code=True
154 | ).eval()
155 |
156 | if args.dtype == "fp16":
157 | self.model.half().cuda()
158 | elif args.dtype == "fp32":
159 | self.model.cuda()
160 |
161 | if args.num_gpus > 1:
162 | from accelerate import dispatch_model
163 |
164 | device_map = auto_configure_device_map(args.num_gpus)
165 | self.model = dispatch_model(self.model, device_map=device_map)
166 |
167 | self.tokenizer = AutoTokenizer.from_pretrained(
168 | model_name, trust_remote_code=True
169 | )
170 | self.model.tokenizer = self.tokenizer
171 | self.system_message = system_message
172 | self.chat_format = chat_format
173 |
174 | def get_response(self, image_folder, prompt="What's in this image?") -> str:
175 | images = []
176 | text_queries = []
177 |         queries = prompt.split("<br>")
178 | for query in queries:
179 | query = query.strip()
180 | if query.endswith((".jpg", ".png", ".jpeg")):
181 | images.append(os.path.join(image_folder, query))
182 | text_queries.append("")
183 | else:
184 | text_queries.append(query)
185 | if args.combine_imgs:
186 | text_query = "".join(text_queries)
187 | text = "[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n".format(
188 | self.system_message, text_query
189 | )
190 | image = [process_images_for_question(images)]
191 | response = model_gen_single_img(
192 | model=self.model,
193 | text=text,
194 | images=image,
195 | )
196 | else:
197 | text_query = (
198 | [
199 | "[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n".format(
200 | self.system_message
201 | )
202 | ]
203 | + text_queries
204 | + [
205 | "{}Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
206 | ]
207 | )
208 | with torch.cuda.amp.autocast():
209 | response = model_gen_multi_img(
210 | model=self.model, text=text_query, images=images
211 | )
212 | return response
213 |
214 |
215 | def arg_parser():
216 | parser = argparse.ArgumentParser()
217 | parser.add_argument(
218 | "--mmvetv2_path",
219 | type=str,
220 | default="/path/to/mm-vet-v2",
221 |         help="Download mm-vet-v2.zip, run `unzip mm-vet-v2.zip`, and set the path here",
222 | )
223 | parser.add_argument(
224 | "--result_path",
225 | type=str,
226 | default="results",
227 | )
228 | parser.add_argument(
229 | "--model_name",
230 | type=str,
231 | default="internlm/internlm-xcomposer2-vl-7b",
232 | help="pretrained ckpt",
233 | )
234 | parser.add_argument(
235 | "--image_first",
236 | action="store_true",
237 |         help="place each image before its corresponding text in the prompt",
238 | )
239 | parser.add_argument(
240 | "--chat_format",
241 | action="store_true",
242 | help="whether to use chat format",
243 | )
244 | parser.add_argument(
245 | "--combine_imgs",
246 | action="store_true",
247 |         help="whether to combine all referenced images into a single collage image",
248 | )
249 | parser.add_argument("--num_gpus", default=2, type=int)
250 | parser.add_argument("--dtype", default="fp16", type=str)
251 | args = parser.parse_args()
252 | return args
253 |
254 |
255 | if __name__ == "__main__":
256 | args = arg_parser()
257 | meta_instruction = """You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).
258 | - InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
259 | - InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by the user such as English and 中文.
260 | - InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively based on the provided image."""
261 |
262 | model = InternLM_XComposer2_VL(
263 | args.model_name, image_first=args.image_first, system_message=meta_instruction
264 | )
265 | if args.image_first:
266 | args.model_name = args.model_name + "-image-first"
267 | if args.chat_format:
268 | args.model_name = args.model_name + "-chat-format"
269 | evaluate_on_mmvetv2(args, model)
270 |
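271 | # Note: --combine_imgs merges all referenced images into one collage and uses the
272 | # single-image generation path; otherwise text segments and images are interleaved
273 | # and fed through model_gen_multi_img.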
--------------------------------------------------------------------------------
/v2/inference/open_flamingo.py:
--------------------------------------------------------------------------------
1 | from open_flamingo import create_model_and_transforms
2 | from PIL import Image
3 | import torch
4 | import os
5 | import argparse
6 | from utils import evaluate_on_mmvetv2
7 |
8 |
9 | class OpenFlamingo:
10 | def __init__(self, model_name='open-flamingo-9b'):
11 | if model_name == 'open-flamingo-9b':
12 | clip_vision_encoder_path="ViT-L-14"
13 | clip_vision_encoder_pretrained="openai"
14 | lang_encoder_path="anas-awadalla/mpt-7b"
15 | tokenizer_path="anas-awadalla/mpt-7b"
16 | cross_attn_every_n_layers=4
17 | self.model, self.image_processor, self.tokenizer = create_model_and_transforms(
18 | clip_vision_encoder_path=clip_vision_encoder_path,
19 | clip_vision_encoder_pretrained=clip_vision_encoder_pretrained,
20 | lang_encoder_path=lang_encoder_path,
21 | tokenizer_path=tokenizer_path,
22 | cross_attn_every_n_layers=cross_attn_every_n_layers,
23 | )
24 |
25 | self.tokenizer.padding_side = "left" # For generation padding tokens should be on the left
26 |
27 | def get_response(self, image_folder, prompt="What's in this image?") -> str:
28 | vision_x = []
29 | text_query = ""
30 |         queries = prompt.split("<br>")
31 | for query in queries:
32 | query = query.strip()
33 | if query.endswith((".jpg", ".png", ".jpeg")):
34 | image_path = os.path.join(image_folder, query)
35 | image = Image.open(image_path).convert('RGB')
36 | vision_x.append(self.image_processor(image).unsqueeze(0))
37 |                 text_query += "<image>"  # OpenFlamingo expects an <image> token per image
38 | else:
39 | text_query += query
40 |
41 | vision_x = torch.cat(vision_x, dim=0)
42 | vision_x = vision_x.unsqueeze(1).unsqueeze(0)
43 |
44 | lang_x = self.tokenizer(
45 | [text_query],
46 | return_tensors="pt",
47 | )
48 |
49 | generated_text = self.model.generate(
50 | vision_x=vision_x,
51 | lang_x=lang_x["input_ids"],
52 | attention_mask=lang_x["attention_mask"],
53 | max_new_tokens=512,
54 | num_beams=3,
55 | )
56 |
57 | response_text = self.tokenizer.decode(generated_text[0])
58 | return response_text.strip()
59 |
60 |
61 | def arg_parser():
62 | parser = argparse.ArgumentParser()
63 | parser.add_argument(
64 | "--mmvetv2_path",
65 | type=str,
66 | default="/path/to/mm-vet-v2",
67 |         help="Download mm-vet-v2.zip, run `unzip mm-vet-v2.zip`, and set the path here",
68 | )
69 | parser.add_argument(
70 | "--result_path",
71 | type=str,
72 | default="results",
73 | )
74 | parser.add_argument(
75 | "--model_name",
76 | type=str,
77 | default="open-flamingo-9b",
78 | help="Open Flamingo model name",
79 | )
80 | args = parser.parse_args()
81 | return args
82 |
83 | if __name__ == "__main__":
84 | args = arg_parser()
85 |
86 | model = OpenFlamingo(args.model_name)
87 | evaluate_on_mmvetv2(args, model)
88 |
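89 | # Example invocation, run from v2/inference (path is a placeholder):
90 | #   python open_flamingo.py --mmvetv2_path /path/to/mm-vet-v2 --model_name open-flamingo-9b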
--------------------------------------------------------------------------------
/v2/inference/qwen.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import os
4 | import base64
5 | import requests
6 | import argparse
7 | from utils import evaluate_on_mmvetv2
8 | from http import HTTPStatus
9 | import dashscope
10 |
11 |
12 | class Qwen:
13 | def __init__(self, model='qwen-vl-max'):
14 | self.model = model
15 |
16 | def get_response(self, image_folder, prompt="What's in this image?"):
17 | messages = []
18 | content = []
19 |         queries = prompt.split("<br>")
20 | img_num = 0
21 | for query in queries:
22 | query = query.strip()
23 | if query == "":
24 | continue
25 | if query.endswith((".jpg", ".png", ".jpeg")):
26 | image_path = os.path.join(image_folder, query)
27 | content.append(
28 | {
29 | "image": f"file://{image_path}"
30 | }
31 | )
32 | img_num += 1
33 | else:
34 | content.append(
35 | {
36 | "text": query
37 | },
38 | )
39 |
40 | messages.append({
41 | "role": "user",
42 | "content": content,
43 | })
44 |
45 | payload = {
46 | "model": self.model,
47 | "messages": messages,
48 | }
49 |
50 |
51 | response = dashscope.MultiModalConversation.call(**payload)
52 | if response.status_code == HTTPStatus.OK:
53 | rps = response['output']['choices'][0]['message']['content']
54 | for rp in rps:
55 | if 'text' in rp:
56 | response_text = rp['text']
57 | return response_text.strip()
58 | else:
59 | print(response.code) # The error code.
60 | print(response.message) # The error message.
61 | return ""
62 |
63 | def arg_parser():
64 | parser = argparse.ArgumentParser()
65 | parser.add_argument(
66 | "--mmvetv2_path",
67 | type=str,
68 | default="/path/to/mm-vet-v2",
69 |         help="Download mm-vet-v2.zip, run `unzip mm-vet-v2.zip`, and set the path here",
70 | )
71 | parser.add_argument(
72 | "--result_path",
73 | type=str,
74 | default="results",
75 | )
76 | parser.add_argument(
77 | "--dashscope_api_key", type=str, default=None,
78 | help="refer to https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start"
79 | )
80 | parser.add_argument(
81 | "--model_name",
82 | type=str,
83 | default="qwen-vl-max",
84 | help="Qwen model name",
85 | )
86 | args = parser.parse_args()
87 | return args
88 |
89 |
90 | if __name__ == "__main__":
91 | args = arg_parser()
92 |
93 | if args.dashscope_api_key:
94 | DASHSCOPE_API_KEY = args.dashscope_api_key
95 | else:
96 | DASHSCOPE_API_KEY = os.getenv('DASHSCOPE_API_KEY')
97 |
98 | if DASHSCOPE_API_KEY is None:
99 | raise ValueError("Please set the DASHSCOPE_API_KEY environment variable or pass it as an argument")
100 |
101 | model = Qwen(model=args.model_name)
102 |
103 | evaluate_on_mmvetv2(args, model)
104 |
105 |
106 |
107 |
108 |
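109 | # Example invocation, run from v2/inference (key and path are placeholders):
110 | #   DASHSCOPE_API_KEY=... python qwen.py --mmvetv2_path /path/to/mm-vet-v2 --model_name qwen-vl-max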
--------------------------------------------------------------------------------
/v2/inference/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import json
4 | import base64
5 | from PIL import Image, ImageDraw
6 |
7 |
8 | def process_images_for_question(images, key=None):
9 |     images = [Image.open(path) for path in images]  # open every image path
10 |     if not images:
11 |         return  # nothing to combine
12 |     n = len(images)
13 |     grid_cols = math.ceil(math.sqrt(n))
14 |     grid_rows = math.ceil(n / grid_cols)
15 |
16 |     # cell size: the largest image plus margins for spacing and the index label
17 |     max_width = max(img.width for img in images)
18 |     max_height = max(img.height for img in images)
19 |     cell_width = max_width + 20  # horizontal gap between cells
20 |     cell_height = max_height + 30  # extra room for the label above each image
21 |
22 |     # paste every image into a white collage laid out on the grid
23 |     collage_width = cell_width * grid_cols
24 |     collage_height = cell_height * grid_rows
25 |     collage = Image.new("RGB", (collage_width, collage_height), "white")
26 |     draw = ImageDraw.Draw(collage)
27 |
28 |     for index, img in enumerate(images):
29 |         row, col = divmod(index, grid_cols)
30 |         x = col * cell_width + (cell_width - img.width) // 2
31 |         y = row * cell_height + (cell_height - img.height - 10) // 2  # center within the cell
32 |         collage.paste(img, (x, y + 20))  # shift down to leave room for the label
33 |
34 |         # draw the 1-based image index above the pasted image
35 |         draw.text((x + img.width // 2, y), str(index + 1), fill="black")
36 |
37 | return collage
38 |
39 |
40 | # Function to encode the image
41 | def encode_image(image_path):
42 | with open(image_path, "rb") as image_file:
43 | return base64.b64encode(image_file.read()).decode("utf-8")
44 |
45 |
46 | def evaluate_on_mmvetv2(args, model):
47 | if os.path.exists(args.result_path) is False:
48 | os.makedirs(args.result_path)
49 |
50 | model_name = args.model_name.replace("/", "--")
51 | results_path = os.path.join(args.result_path, f"{model_name}.json")
52 | image_folder = os.path.join(args.mmvetv2_path, "images")
53 | meta_data = os.path.join(args.mmvetv2_path, "mm-vet-v2.json")
54 |
55 | if os.path.exists(results_path):
56 | with open(results_path, "r") as f:
57 | results = json.load(f)
58 | else:
59 | results = {}
60 |
61 | with open(meta_data, "r") as f:
62 | data = json.load(f)
63 |
64 | for i in range(len(data)):
65 | id = f"v2_{i}"
66 | if id in results:
67 | continue
68 | prompt = data[id]["question"].strip()
69 | print(id)
70 | print(f"Prompt: {prompt}")
71 | try:
72 | response = model.get_response(image_folder, prompt)
73 | except:
74 | response = ""
75 | print(f"Response: {response}")
76 | results[id] = response
77 | with open(results_path, "w") as f:
78 | json.dump(results, f, indent=4)
79 |
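80 | # Note: evaluate_on_mmvetv2 expects <mmvetv2_path>/mm-vet-v2.json to map ids
81 | # ("v2_0", "v2_1", ...) to entries whose "question" field interleaves text and
82 | # image filenames; results are written after every sample, so an interrupted run
83 | # resumes from the existing <model_name>.json file.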
--------------------------------------------------------------------------------
/v2/mm-vet-v2_evaluator.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from openai import OpenAI
3 | from openai._exceptions import RateLimitError, BadRequestError
4 | import json
5 | import os
6 | from tqdm import tqdm
7 | import pandas as pd
8 | import numpy as np
9 | from collections import Counter
10 | import time
11 | import pathlib
12 |
13 | prompt = """Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. in the question indicates where an image is. in the ground truth means it is totally right only when all elements in the ground truth are present in the prediction, and means it is totally right when any one element in the ground truth is present in the prediction. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Just complete the last space of the correctness score.
14 |
15 | | Question | Ground truth | Prediction | Correctness |
16 | | --- | --- | --- | --- |
17 | | What is x in the equation? | -1 <AND> -5 | x = 3 | 0.0 |
18 | | What is x in the equation? | -1 <AND> -5 | x = -1 | 0.5 |
19 | | What is x in the equation? | -1 <AND> -5 | x = -5 | 0.5 |
20 | | What is x in the equation? | -1 <AND> -5 | x = -5 or 5 | 0.5 |
21 | | What is x in the equation? | -1 <AND> -5 | x = -1 or x = -5 | 1.0 |
22 | | Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme talks about Iceland and Greenland. It's pointing out that despite their names, Iceland is not very icy and Greenland isn't very green. | 0.4 |
23 | | Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme is using humor to point out the misleading nature of Iceland's and Greenland's names. Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow. The text 'This is why I have trust issues' is a playful way to suggest that these contradictions can lead to distrust or confusion. The humor in this meme is derived from the unexpected contrast between the names of the countries and their actual physical characteristics. | 1.0 |
24 | """
25 |
26 |
27 | def arg_parser(prompt=prompt):
28 | parser = argparse.ArgumentParser()
29 | parser.add_argument(
30 | "--mmvetv2_path",
31 | type=str,
32 | default="/path/to/mm-vet-v2",
33 |         help="Download mm-vet-v2.zip, run `unzip mm-vet-v2.zip`, and set the path here",
34 | )
35 | parser.add_argument(
36 | "--result_file",
37 | type=str,
38 | default="results/llava_llama2_13b_chat.json",
39 | help="path to the model result file, must end with .json",
40 | )
41 | parser.add_argument(
42 | "--result_path",
43 | type=str,
44 | default="results",
45 | help="path to save the grading results",
46 | )
47 | parser.add_argument(
48 | "--openai_api_key", type=str, default=None,
49 | help="If not specified, use OPENAI_API_KEY environment variable."
50 | )
51 | parser.add_argument(
52 | "--gpt_model", type=str, default="gpt-4-0613", help="gpt model name"
53 | )
54 | parser.add_argument(
55 | "--prompt", type=str, default=prompt, help="prompt for the model"
56 | )
57 | parser.add_argument(
58 | "--subset",
59 | type=str,
60 | default=None,
61 | help="path to json where contains ids to evaluate",
62 | )
63 | parser.add_argument(
64 | "--decimal_places",
65 | type=int,
66 | default=1,
67 | help="number of decimal places to round to",
68 | )
69 | parser.add_argument(
70 | "--num_run",
71 | type=int,
72 | default=1,
73 | help="we set it as 5 in the paper",
74 | )
75 | args = parser.parse_args()
76 | return args
77 |
78 | def get_file_names(args, model, subset_name):
79 |     # grade results for each sample to save
80 | grade_file = f"{model}_{args.gpt_model}-grade-{args.num_run}runs_dev8.json"
81 | grade_file = os.path.join(args.result_path, grade_file)
82 |
83 | # score results regarding capabilities/capability integration to save
84 | cap_score_file = (
85 | f"{model}_{subset_name}{args.gpt_model}-cap-score-{args.num_run}runs_dev8.csv"
86 | )
87 | cap_score_file = os.path.join(args.result_path, cap_score_file)
88 | cap_int_score_file = f"{model}_{subset_name}{args.gpt_model}-cap-int-score-{args.num_run}runs_dev8.csv"
89 | cap_int_score_file = os.path.join(args.result_path, cap_int_score_file)
90 | return grade_file, cap_score_file, cap_int_score_file
91 |
92 |
93 | def load_metadata(args):
94 | if args.subset:
95 | with open(args.subset, "r") as f:
96 | subset = json.load(f)
97 |
98 | subset_name = pathlib.Path(args.subset).stem
99 | subset_name = subset_name + "_"
100 | else:
101 | subset = None
102 | subset_name = ""
103 |
104 | mmvet_metadata = os.path.join(args.mmvetv2_path, "mm-vet-v2.json")
105 | with open(mmvet_metadata, "r") as f:
106 | data = json.load(f)
107 |
108 | counter = Counter()
109 | cap_set_list = []
110 | cap_set_counter = []
111 | len_data = 0
112 | for id, value in data.items():
113 | if subset is not None and id not in subset:
114 | continue
115 | cap = value["capability"]
116 | cap = set(cap)
117 | counter.update(cap)
118 | if cap not in cap_set_list:
119 | cap_set_list.append(cap)
120 | cap_set_counter.append(1)
121 | else:
122 | cap_set_counter[cap_set_list.index(cap)] += 1
123 |
124 | len_data += 1
125 |
126 | sorted_list = counter.most_common()
127 | columns = [k for k, v in sorted_list]
128 | columns.append("total")
129 | columns.append("std")
130 | columns.append("runs")
131 | df = pd.DataFrame(columns=columns)
132 |
133 | cap_set_sorted_indices = np.argsort(-np.array(cap_set_counter))
134 | new_cap_set_list = []
135 | new_cap_set_counter = []
136 | for index in cap_set_sorted_indices:
137 | new_cap_set_list.append(cap_set_list[index])
138 | new_cap_set_counter.append(cap_set_counter[index])
139 |
140 | cap_set_list = new_cap_set_list
141 | cap_set_counter = new_cap_set_counter
142 | cap_set_names = ["_".join(list(cap_set)) for cap_set in cap_set_list]
143 |
144 | columns2 = cap_set_names
145 | columns2.append("total")
146 | columns2.append("std")
147 | columns2.append("runs")
148 | df2 = pd.DataFrame(columns=columns2)
149 | return (
150 | subset,
151 | subset_name,
152 | data,
153 | counter,
154 | cap_set_list,
155 | cap_set_counter,
156 | len_data,
157 | df,
158 | df2,
159 | cap_set_names,
160 | )
161 |
162 |
163 | def runs(
164 | args,
165 | grade_file,
166 | data,
167 | len_data,
168 | subset=None,
169 | ):
170 | with open(args.result_file) as f:
171 | results = json.load(f)
172 | if os.path.exists(grade_file):
173 | with open(grade_file, "r") as f:
174 | grade_results = json.load(f)
175 | else:
176 | grade_results = {}
177 |
178 | def need_more_runs(args, grade_results, len_data):
179 | need_more_runs = False
180 | if len(grade_results) > 0:
181 | for k, v in grade_results.items():
182 | if len(v["score"]) < args.num_run:
183 | need_more_runs = True
184 | break
185 | return need_more_runs or len(grade_results) < len_data
186 |
187 | while need_more_runs(args, grade_results, len_data):
188 | for j in range(args.num_run):
189 | print(f"eval run {j}")
190 | for id, line in tqdm(data.items()):
191 | if subset is not None and id not in subset:
192 | continue
193 | if id in grade_results and len(grade_results[id]["score"]) >= (j + 1):
194 | continue
195 |
196 | model_pred = results[id]
197 |                 queries = line['question'].split('<br>')
198 | query = ""
199 | for q in queries:
200 |                     if q.endswith((".jpg", ".jpeg", ".png")):
201 |                         query += "<image>"
202 | else:
203 | query += q
204 |                 question = prompt + '| ' + ' | '.join([query.replace('\n', '<br>'), line['answer'].replace("<AND>", " <AND> ").replace("<OR>", " <OR> ").replace('\n', '<br>'), model_pred.replace('\n', '<br>'), ""])
205 | # print(question)
206 | messages = [
207 | {"role": "user", "content": question},
208 | ]
209 |
210 | if id not in grade_results:
211 | sample_grade = {"model": [], "content": [], "score": []}
212 | else:
213 | sample_grade = grade_results[id]
214 |
215 | grade_sample_run_complete = False
216 | temperature = 0.0
217 |
218 | while not grade_sample_run_complete:
219 | try:
220 | response = client.chat.completions.create(
221 | model=args.gpt_model,
222 | max_tokens=3,
223 | temperature=temperature,
224 | messages=messages,
225 | )
226 | content = response.choices[0].message.content
227 | # print(content)
228 | flag = True
229 | try_time = 1
230 | while flag:
231 | try:
232 | content = content.split(" ")[0].strip()
233 | score = float(content)
234 | if score > 1.0 or score < 0.0:
235 | assert False
236 | flag = False
237 | except:
238 | question_try = question + "\n\nPredict the correctness of the answer (digit): "
239 | messages = [
240 | {"role": "user", "content": question_try},
241 | ]
242 | # print(question_try)
243 | response = client.chat.completions.create(
244 | model=args.gpt_model,
245 | max_tokens=3,
246 | temperature=temperature,
247 | messages=messages,
248 | )
249 | content = response.choices[0].message.content
250 | # print(content)
251 | try_time += 1
252 | temperature += 0.5
253 | print(f"{id} try {try_time} times")
254 | print(content)
255 | if try_time > 5:
256 | score = 0.0
257 | flag = False
258 | grade_sample_run_complete = True
259 | response_model = response.model
260 | except RateLimitError as e:
261 | # gpt4 may have token rate limit
262 | print("sleep 30s")
263 | time.sleep(30)
264 | except BadRequestError as e:
265 | content = "BadRequestError"
266 | score = 0.0
267 | flag = False
268 | print(id, "BadRequestError")
269 | response_model = args.gpt_model
270 | break
271 |
272 | if len(sample_grade["model"]) >= j + 1:
273 | sample_grade["model"][j] = response_model
274 | sample_grade["content"][j] = content
275 | sample_grade["score"][j] = score
276 | else:
277 | sample_grade["model"].append(response_model)
278 | sample_grade["content"].append(content)
279 | sample_grade["score"].append(score)
280 | grade_results[id] = sample_grade
281 |
282 | with open(grade_file, "w") as f:
283 | json.dump(grade_results, f, indent=4)
284 |
285 | return grade_results
286 |
287 |
288 | def export_result(args, model, df, df2, grade_results, data, cap_set_counter, cap_set_names):
289 | columns = df.columns
290 | columns2 = df2.columns
291 |
292 | cap_socres = {k: [0.0] * args.num_run for k in columns[:-2]}
293 | counter["total"] = len_data
294 |
295 | cap_socres2 = {k: [0.0] * args.num_run for k in columns2[:-2]}
296 | counter2 = {columns2[i]: cap_set_counter[i] for i in range(len(cap_set_counter))}
297 | counter2["total"] = len_data
298 |
299 | for k, v in grade_results.items():
300 | if subset is not None and k not in subset:
301 | continue
302 | for i in range(args.num_run):
303 | score = v["score"][i]
304 | caps = set(data[k]["capability"])
305 | for c in caps:
306 | cap_socres[c][i] += score
307 |
308 | cap_socres["total"][i] += score
309 |
310 | index = cap_set_list.index(caps)
311 | cap_socres2[cap_set_names[index]][i] += score
312 | cap_socres2["total"][i] += score
313 |
314 | for k, v in cap_socres.items():
315 | cap_socres[k] = np.array(v) / counter[k] * 100
316 |
317 | std = round(cap_socres["total"].std(), args.decimal_places)
318 | total_copy = cap_socres["total"].copy()
319 | runs = str(list(np.round(total_copy, args.decimal_places)))
320 |
321 | for k, v in cap_socres.items():
322 | cap_socres[k] = round(v.mean(), args.decimal_places)
323 |
324 | cap_socres["std"] = std
325 | cap_socres["runs"] = runs
326 | df.loc[model] = cap_socres
327 |
328 | for k, v in cap_socres2.items():
329 | cap_socres2[k] = round(
330 | np.mean(np.array(v) / counter2[k] * 100), args.decimal_places
331 | )
332 | cap_socres2["std"] = std
333 | cap_socres2["runs"] = runs
334 | df2.loc[model] = cap_socres2
335 |
336 | df.to_csv(cap_score_file)
337 | df2.to_csv(cap_int_score_file)
338 |
339 | return df, df2
340 |
341 |
342 | if __name__ == "__main__":
343 | args = arg_parser()
344 |
345 | if args.openai_api_key:
346 | OPENAI_API_KEY = args.openai_api_key
347 | else:
348 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
349 | client = OpenAI(
350 | api_key=OPENAI_API_KEY
351 | )
352 |
353 | if os.path.exists(args.result_file) is False:
355 | raise ValueError("Result file does not exist")
356 | if not args.result_file.endswith(('.json', '.JSON')):
357 | raise ValueError("Result file should be a json file")
358 | model = pathlib.Path(args.result_file).stem
359 |
360 | metadata = load_metadata(args)
361 | (
362 | subset,
363 | subset_name,
364 | data,
365 | counter,
366 | cap_set_list,
367 | cap_set_counter,
368 | len_data,
369 | df,
370 | df2,
371 | cap_set_names,
372 | ) = metadata
373 | file_names = get_file_names(args, model, subset_name)
374 | (
375 | grade_file,
376 | cap_score_file,
377 | cap_int_score_file,
378 | ) = file_names
379 | grade_results = runs(
380 | args,
381 | grade_file,
382 | data,
383 | len_data,
384 | subset,
385 | )
386 | df, df2 = export_result(
387 | args,
388 | model,
389 | df,
390 | df2,
391 | grade_results,
392 | data,
393 | cap_set_counter,
394 | cap_set_names,
395 | )
396 | print(df)
397 | print("\n")
398 | print(df2)
399 | print("\n")
400 | print(f"Grading results are saved in:\n{grade_file}\n{cap_score_file}\n{cap_int_score_file}")
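401 |
402 | # Example invocation, run from the v2 directory (result file shown is one of the provided samples):
403 | #   python mm-vet-v2_evaluator.py --mmvetv2_path /path/to/mm-vet-v2 \
404 | #       --result_file results/gpt-4o-2024-05-13_detail-high.json --num_run 5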
--------------------------------------------------------------------------------