├── .gitignore ├── LICENSE ├── README.md ├── README_more_samples.md ├── inference ├── bard.py ├── bard.sh ├── claude.py ├── gemini_vision.py ├── gpt4v.py ├── qwen.py └── utils.py ├── mm-vet_evaluator.ipynb ├── mm-vet_evaluator.py ├── results └── llava_llama2_13b_chat.json └── v2 ├── README.md ├── inference ├── claude.py ├── cogagent.py ├── cogvlm.py ├── emu2.py ├── gemini.py ├── gpt4.py ├── internvl.py ├── internvl2.py ├── ixc2.py ├── open_flamingo.py ├── qwen.py └── utils.py ├── mm-vet-v2_evaluator.py └── results ├── claude-3-5-sonnet-20240620.json ├── gemini-1.5-pro.json └── gpt-4o-2024-05-13_detail-high.json /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 |
3 |

4 | 5 | 6 | # [MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities](https://arxiv.org/abs/2308.02490) (ICML 2024) 7 | # [MM-Vet v2](v2/) 8 | 9 |

10 | [Paper] 11 | [Download Dataset] 12 | [Dataset on Hugging Face] 13 | [Leaderboard] 14 | [Online Evaluator] 15 |

16 | 17 | 18 | 2024/08/02: :fire: :fire: We release [**MM-Vet v2**](v2/), an extension of MM-Vet that adds a new vision-language capability called "image-text sequence understanding" and expands the evaluation set while maintaining high quality. 19 | 20 | 21 | 2024/03/17: :fire: :fire: We release inference scripts for Qwen-VL and Claude. Qwen-VL-Max and Claude 3 Opus achieve 66.6% and 58.1%, respectively. 22 | 23 | 24 | 2023/12/23: :fire: :fire: We release inference scripts for GPT-4V and Gemini. Gemini Pro Vision achieves a 64.3% score. 25 | 26 | 27 | 2023/10/24: :fire: :fire: We evaluate GPT-4V on MM-Vet and observe that it achieves a 67.7% score, outperforming other methods by a large margin (20%). However, it still has a large gap to the full mark (100%), indicating that further efforts are needed to improve the integrated capabilities of LMMs. See the [leaderboard](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet), the [updated paper](https://arxiv.org/abs/2308.02490) and [GPT-4V prediction examples](#gpt-4v-prediction-examples). 28 | 29 | 30 | 2023/10/07: :fire: :fire: **We released the [MM-Vet leaderboard](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet) on paperswithcode.com, where you can conveniently add your model results.** Note that the date there refers to the model release date rather than the paper date, because some improved model versions were released after the paper. 31 | 32 | In this repo, we offer the data and the evaluator of MM-Vet, proposed in our paper "[MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities](https://arxiv.org/abs/2308.02490)". The code is under the Apache 2.0 license, and the dataset is under the CC BY-NC 4.0 license. 33 | 34 | 35 | ![MM-Vet](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258206593-c61d21c8-cd9b-440c-9592-958d530bf122.png) 36 | Figure 1: Different from conventional VL benchmarks that require only one or two capabilities, MM-Vet focuses on the integration of different core VL capabilities, including recognition, OCR, knowledge, language generation, spatial awareness, and math. 37 | 38 | ## Evaluate your model on MM-Vet 39 | **Step 0**: Install the openai package with `pip install "openai>=1"` and get access to the GPT-4/GPT-3.5 API. If you do not have access, you can try the MM-Vet online evaluator on [Hugging Face Space](https://huggingface.co/spaces/whyu/MM-Vet_Evaluator) (but it may take a long time depending on the number of users). 40 | 41 | **Step 1**: Download the MM-Vet data [here](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) and unzip it with `unzip mm-vet.zip`. 42 | 43 | **Step 2**: Run your model on MM-Vet and save its outputs in a JSON file like [llava_llama2_13b_chat.json](results/llava_llama2_13b_chat.json) (see the format sketch below), or just use [llava_llama2_13b_chat.json](results/llava_llama2_13b_chat.json) as an example to evaluate. We also release inference scripts for GPT-4V and Gemini. 44 | 45 | ```bash 46 | image_detail=high # or auto, low; refer to https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding 47 | 48 | python inference/gpt4v.py --mmvet_path /path/to/mm-vet --image_detail ${image_detail} 49 | ``` 50 | 51 | ```bash 52 | python inference/gemini_vision.py --mmvet_path /path/to/mm-vet 53 | ``` 54 | 55 | **Step 3**: `git clone https://github.com/yuweihao/MM-Vet.git && cd MM-Vet`, then run the LLM-based evaluator in [mm-vet_evaluator.ipynb](mm-vet_evaluator.ipynb) or [mm-vet_evaluator.py](mm-vet_evaluator.py) (thanks to @HireTheHero for arranging it into a .py version).
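The evaluator expects the result file to map each MM-Vet question id (`v1_0`, `v1_1`, ...) to your model's answer string, mirroring what the scripts under `inference/` produce. A minimal sketch of building such a file, assuming the unzipped data layout from Step 1 (the placeholder answer and the file name `my_model.json` are illustrative; replace the placeholder with your own model call):

```python
import json

mmvet_path = "/path/to/mm-vet"  # folder obtained from `unzip mm-vet.zip`

with open(f"{mmvet_path}/mm-vet.json") as f:
    data = json.load(f)  # keys are question ids: "v1_0", "v1_1", ...

results = {}
for i in range(len(data)):
    qid = f"v1_{i}"
    image_path = f"{mmvet_path}/images/{data[qid]['imagename']}"
    question = data[qid]["question"]
    # Replace this placeholder with your model's answer for (image_path, question).
    results[qid] = "your model's answer here"

with open("results/my_model.json", "w") as f:
    json.dump(results, f, indent=4)
```

Then pass this file to the evaluator: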
56 | ```bash 57 | python mm-vet_evaluator.py --mmvet_path /path/to/mm-vet --result_file results/llava_llama2_13b_chat.json 58 | ``` 59 | If you cannot access GPT-4 (gpt-4-0613), you can upload your model output results (json file) to MM-Vet online evaluator [Hugging Face Space](https://huggingface.co/spaces/whyu/MM-Vet_Evaluator) to get the grading results. 60 | 61 | 62 | ## Citation 63 | ``` 64 | @inproceedings{yu2024mm, 65 | title={Mm-vet: Evaluating large multimodal models for integrated capabilities}, 66 | author={Yu, Weihao and Yang, Zhengyuan and Li, Linjie and Wang, Jianfeng and Lin, Kevin and Liu, Zicheng and Wang, Xinchao and Wang, Lijuan}, 67 | booktitle={International conference on machine learning}, 68 | year={2024}, 69 | organization={PMLR} 70 | } 71 | ``` 72 | 73 | ## GPT-4V Prediction Examples 74 | ![GPT-4V prediction examples](https://github-production-user-asset-6210df.s3.amazonaws.com/15921929/277925013-daf61a18-5472-4064-88a1-d1c741bed018.png) 75 | 76 | ![GPT-4V prediction examples](https://github-production-user-asset-6210df.s3.amazonaws.com/15921929/277925959-269ff3fa-1f03-43b8-a497-60ee9e236eb3.png) 77 | 78 | ![GPT-4V prediction examples](https://github-production-user-asset-6210df.s3.amazonaws.com/15921929/277926586-0318ad3b-20b7-4995-b2bd-659e76b928fd.png) 79 | 80 | 81 | 82 | ## About running Bard 83 | Please refer to these two files: [inference_bard.sh](misc/inference_bard.sh) and [inference_bard.py](misc/inference_bard.py). 84 | 85 | 86 | ## Some samples on MM-Vet 87 | ![MM-Vet example 1](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258215482-61d2e5d8-51c9-45ed-b35b-8e6c64bb26d8.jpg) 88 | 89 | **Q**: What occasions would someone use this meme? 90 | 91 | **GT**: This meme, commonly known as "Screaming Panda," is typically used to express shock, surprise, or fear. It could be used in response to a startling or unexpected event, or to convey a sense of panic or alarm. Some possible occasions where someone might use this meme include: 92 | 93 | - Reacting to a jump scare in a horror movie 94 | - Responding to a surprising plot twist in a TV show or book 95 | - Expressing shock at a news headline or current event 96 | - Conveying fear or anxiety about an upcoming deadline or exam 97 | - Showing surprise at an unexpected outcome in a sports game or other competition. 98 | 99 | **Required capabilities**: Recognition, knowledge, language generation 100 | 101 | --- 102 | 103 | ![MM-Vet example 2](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258216998-2d3850e2-88cb-43e7-89c9-0e4cd50561fa.jpg) 104 | 105 | **Q**: How many tomatoes are there? 106 | 107 | **GT**: 5 108 | 109 | **Required capabilities**: Recognition 110 | 111 | --- 112 | 113 | ![MM-Vet example 3](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258217402-a9efcb58-d4e9-453d-8d9f-032c546c50fe.jpg) 114 | 115 | **Q**: What is located to the right of the shampoo? 116 | 117 | **GT**: conditioner 118 | 119 | **Required capabilities**: OCR, spatial awareness 120 | 121 | --- 122 | 123 | ![MM-Vet example 4](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258218056-c792ca0b-f0e9-4f1c-b6ea-b48ab5f1df0e.jpg) 124 | 125 | **Q**: Which room is bigger, the double garage or the living room? 
126 | 127 | **GT**: double garage 128 | 129 | **Required capabilities**: OCR, spatial awareness, math 130 | 131 | --- 132 | 133 | ![MM-Vet example 5](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258218959-dea08352-d2ce-4b53-a201-899564c7ca73.jpg) 134 | 135 | **Q**: On the right desk, what is to the left of the laptop? 136 | 137 | **GT**: table lamp \ desk lamp 138 | 139 | **Required capabilities**: Recognition, spatial awareness 140 | 141 | --- 142 | 143 | ![MM-Vet example 6](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258219954-3581fa33-2a19-411c-93b3-b757515757be.jpg) 144 | 145 | **Q**: What are all the scene text in the image? 146 | 147 | **GT**: 5:30PM\88%\Mario Kart 8 Deluxe\MARIO KART 8 DELUXE\SUPER MARIO ODYSSEY\THE LEGEND OF ZELDA\BREATH OF WILD\Options\Start 148 | 149 | **Required capabilities**: OCR 150 | 151 | --- 152 | 153 | ![MM-Vet example 7](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258220686-66f92663-fdd0-49aa-9920-1d9e3d60f1cd.png) 154 | 155 | **Q**: How many gallons of supreme gasoline can I get with $50? 156 | 157 | **GT**: 13.6 \ 13.7 158 | 159 | **Required capabilities**: OCR, math 160 | 161 | --- 162 | 163 | ![MM-Vet example 8](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258221079-3836286e-2ba3-4c0f-b54d-f87ccd1a04b1.png) 164 | 165 | **Q**: In which country was this photo taken? 166 | 167 | **GT**: Australia 168 | 169 | **Required capabilities**: Recognition, knowledge 170 | 171 | --- 172 | 173 | ![MM-Vet example 9](https://github-production-user-asset-6210df.s3.amazonaws.com/15921929/260642720-34a361ff-2350-494f-a557-4228700097c4.jpg) 174 | 175 | **Q**: Can you explain this meme? 176 | 177 | **GT**: This meme is a humorous take on procrastination and the tendency to delay tasks until a specific time. The person in the meme plans to do something at 8 o'clock, but when they miss that deadline by a few minutes, they decide to wait until 9 o'clock instead. The image of Kermit the Frog lying in bed represents the person's laziness and lack of motivation to complete the task. 178 | 179 | **Required capabilities**: Recognition, OCR, knowledge, language generation 180 | 181 | --- 182 | 183 | ![MM-Vet example 10](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258222130-16635f1c-6e68-4e13-83b1-86c3c5cafd03.png) 184 | 185 | **Q**: The graph below shows the long-term international migration, UK, 1999-2008. 186 | 187 | Summarize the information by selecting and reporting the main features, and make comparisons where relevant. 188 | 189 | You should write at least 150 words. 190 | 191 | **GT**: The chart gives information about UK immigration, emigration and net migration between 1999 and 2008. 192 | 193 | Both immigration and emigration rates rose over the period shown, but the figures for immigration were significantly higher. Net migration peaked in 2004 and 2007. 194 | 195 | In 1999, over 450,000 people came to live in the UK, while the number of people who emigrated stood at just under 300,000. The figure for net migration was around 160,000, and it remained at a similar level until 2003. From 1999 to 2004, the immigration rate rose by nearly 150,000 people, but there was a much smaller rise in emigration. Net migration peaked at almost 250,000 people in 2004. 196 | 197 | After 2004, the rate of immigration remained high, but the number of people emigrating fluctuated. Emigration fell suddenly in 2007, before peaking at about 420,000 people in 2008. 
As a result, the net migration figure rose to around 240,000 in 2007, but fell back to around 160,000 in 2008. 198 | 199 | **Required capabilities**: Recognition, OCR, language generation, spatial awareness 200 | 201 | --- 202 | 203 | ![MM-Vet example 11](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258222558-65fa56c6-7721-43de-8938-9fba6f931291.jpg) 204 | 205 | **Q**: Which car is on the parking spot 33? 206 | 207 | **GT**: no \ empty 208 | 209 | **Required capabilities**: Recognition, OCR, spatial awareness 210 | 211 | --- 212 | 213 | 214 | ![MM-Vet example 12](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258222983-14a13e90-1907-4944-8617-30a3bb4f5feb.png) 215 | 216 | **Q**: Is this apple organic? 217 | 218 | **GT**: yes 219 | 220 | **Required capabilities**: Recognition, OCR 221 | 222 | --- 223 | 224 | ![MM-Vet example 13](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258223250-6fcde914-1541-45b9-9103-cc88c75a5408.png) 225 | 226 | **Q**: Which are producers in this food web? 227 | 228 | **GT**: Phytoplankton \ Seaweed 229 | 230 | **Required capabilities**: OCR, knowledge, spatial awareness 231 | 232 | --- 233 | 234 | ![MM-Vet example 14](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258223774-a05ddf4f-0d8e-4015-9c61-fd4bf09968f9.png) 235 | 236 | **Q**: Is the person bigger than the car? 237 | 238 | **GT**: no 239 | 240 | **Required capabilities**: Recognition, knowledge, spatial awareness 241 | 242 | --- 243 | 244 | ![MM-Vet example 15](https://github.com/yuweihao/reclor/assets/49296856/75f61c60-d1d2-40c2-87fe-5633971cbe62) 245 | 246 | **Q**: The table below gives information about the underground railway systems in six cities. 247 | 248 | Summarise the information by selecting and reporting the main features, and make comparisons where relevant. 249 | 250 | You should write at least 150 words. 251 | 252 | **GT**: The table shows data about the underground rail networks in six major cities. 253 | 254 | The table compares the six networks in terms of their age, size and the number of people who use them each year. It is clear that the three oldest underground systems are larger and serve significantly more passengers than the newer systems. 255 | 256 | The London underground is the oldest system, having opened in 1863. It is also the largest system, with 394 kilometres of route. The second largest system, in Paris, is only about half the size of the London underground, with 199 kilometres of route. However, it serves more people per year. While only third in terms of size, the Tokyo system is easily the most used, with 1927 million passengers per year. 257 | 258 | Of the three newer networks, the Washington DC underground is the most extensive, with 126 kilometres of route, compared to only 11 kilometres and 28 kilometres for the Kyoto and Los Angeles systems. The Los Angeles network is the newest, having opened in 2001, while the Kyoto network is the smallest and serves only 45 million passengers per year. 259 | 260 | **Required capabilities**: OCR, language generation, spatial awareness 261 | 262 | --- 263 | 264 | ![MM-Vet example 16](https://github-production-user-asset-6210df.s3.amazonaws.com/49296856/258224681-852ab94c-e0da-4dc5-b080-fc3ca6311cfc.png) 265 | 266 | **Q**: What will the girl on the right write on the board? 
267 | 268 | **GT**: 14 269 | 270 | **Required capabilities**: Recognition, OCR, spatial awareness, math 271 | 272 | 273 | More samples are shown [here](README_more_samples.md). -------------------------------------------------------------------------------- /README_more_samples.md: -------------------------------------------------------------------------------- 1 | ## More examples in MM-Vet 2 | 3 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_0.png) 4 | 5 | **Q**: What is x in the equation? 6 | 7 | **GT**: -1 \ -5 8 | 9 | **Required capabilities**: OCR, math 10 | 11 | --- 12 | 13 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_1.png) 14 | 15 | **Q**: What is d in the last equation? 16 | 17 | **GT**: 1.25 \ 5/4 18 | 19 | **Required capabilities**: OCR, math 20 | 21 | --- 22 | 23 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_11.jpg) 24 | 25 | **Q**: What is the total price for a bottle of Merlot and a bottle of Cabernet shown in the image? 26 | 27 | **GT**: 249.98 28 | 29 | **Required capabilities**: OCR, spatial awareness, math 30 | 31 | --- 32 | 33 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_13.jpg) 34 | 35 | **Q**: I am getting one funghi pizza and one prosciutto pizza. How much should I pay in total? 36 | 37 | **GT**: 14.8 38 | 39 | **Required capabilities**: OCR, math 40 | 41 | --- 42 | 43 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_23.png) 44 | 45 | **Q**: What is the price for tomatoes? 46 | 47 | **GT**: eight \ 8.0 48 | 49 | **Required capabilities**: recognition, OCR, spatial awareness 50 | 51 | --- 52 | 53 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_27.png) 54 | 55 | **Q**: What earth's interior structure does number 2 indicate? 56 | 57 | **GT**: mantle 58 | 59 | **Required capabilities**: OCR, knowledge, spatial awareness 60 | 61 | --- 62 | 63 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_33.png) 64 | 65 | **Q**: Should I add sugar when mixing egg and milk? 66 | 67 | **GT**: no 68 | 69 | **Required capabilities**: recognition, OCR, spatial awareness 70 | 71 | --- 72 | 73 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_36.jpg) 74 | 75 | **Q**: What should we add in the third step? 76 | 77 | **GT**: milk 78 | 79 | **Required capabilities**: recognition, OCR, spatial awareness 80 | 81 | --- 82 | 83 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_44.png) 84 | 85 | **Q**: What is the difference in metric tonnes between the amount of plastic produced and the amount discarded? 86 | 87 | **GT**: 105 88 | 89 | **Required capabilities**: OCR, spatial awareness, math 90 | 91 | --- 92 | 93 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_46.png) 94 | 95 | **Q**: What is the estimated average standing charge for the year 2023? 96 | 97 | **GT**: 271 98 | 99 | **Required capabilities**: OCR, spatial awareness, math 100 | 101 | --- 102 | 103 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_49.jpg) 104 | 105 | **Q**: What are the appliances in the kitchen in this floorplan? 
106 | 107 | **GT**: oven \ dishwasher 108 | 109 | **Required capabilities**: OCR, spatial awareness 110 | 111 | 112 | --- 113 | 114 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_52.png) 115 | 116 | **Q**: What is the solution if the lamp is plugged in and the bulb has burned out? 117 | 118 | **GT**: replace blub 119 | 120 | **Required capabilities**: OCR, spatial awareness 121 | 122 | --- 123 | 124 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_54.png) 125 | 126 | **Q**: What is the average wait time to climb out of bed? 127 | 128 | **GT**: 15 min 129 | 130 | **Required capabilities**: OCR, spatial awareness, math 131 | 132 | --- 133 | 134 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_56.png) 135 | 136 | **Q**: What is Japan's gdp in 2012? 137 | 138 | **GT**: 6,233.15 billion U.S. dollars 139 | 140 | **Required capabilities**: OCR, spatial awareness 141 | 142 | --- 143 | 144 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_57.jpg) 145 | 146 | **Q**: In which years did rowing and athletics have the same number of gold medals? 147 | 148 | **GT**: 2000 \ 2012 149 | 150 | **Required capabilities**: OCR, spatial awareness 151 | 152 | --- 153 | 154 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_58.png) 155 | 156 | **Q**: Which department is the person who has the highest salary from? 157 | 158 | **GT**: Administration 159 | 160 | **Required capabilities**: OCR, spatial awareness 161 | 162 | --- 163 | 164 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_62.png) 165 | 166 | **Q**: What percentage does salary contribute to total income? 167 | 168 | **GT**: 56.2 169 | 170 | **Required capabilities**: OCR, spatial awareness, math 171 | 172 | --- 173 | 174 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_64.png) 175 | 176 | **Q**: Who is the person on the left? 177 | 178 | **GT**: Keira Knightley 179 | 180 | **Required capabilities**: recognition, spatial awareness 181 | 182 | --- 183 | 184 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_67.png) 185 | 186 | **Q**: Which one is unsweetened? 187 | 188 | **GT**: oat 189 | 190 | **Required capabilities**: OCR, spatial awareness 191 | 192 | --- 193 | 194 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_69.jpg) 195 | 196 | **Q**: What are the calories for the entire bag? 197 | 198 | **GT**: 275 199 | 200 | **Required capabilities**: OCR, math 201 | 202 | --- 203 | 204 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_72.png) 205 | 206 | **Q**: What is the name of this dish? 207 | 208 | **GT**: pad thai 209 | 210 | **Required capabilities**: recognition 211 | 212 | --- 213 | 214 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_73.jpg) 215 | 216 | **Q**: What is the make of the car on the left? 217 | 218 | **GT**: volkswagen 219 | 220 | **Required capabilities**: recognition, OCR, spatial awareness 221 | 222 | --- 223 | 224 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_78.jpg) 225 | 226 | **Q**: What is the name of this landmark? 
227 | 228 | **GT**: baochu pagoda 229 | 230 | **Required capabilities**: recognition 231 | 232 | 233 | --- 234 | 235 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_79.jpg) 236 | 237 | **Q**: What is the face value of this banknote? 238 | 239 | **GT**: five 240 | 241 | **Required capabilities**: recognition, knowledge 242 | 243 | 244 | --- 245 | 246 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_81.jpg) 247 | 248 | **Q**: What type of currency does this banknote represent? 249 | 250 | **GT**: Swede \ Kronor 251 | 252 | **Required capabilities**: recognition 253 | 254 | --- 255 | 256 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_82.png) 257 | 258 | **Q**: Which chest xray disease can be found? (Options include: Atelectasis; Cardiomegaly; Effusion; Infiltration; Mass; Nodule; Pneumonia; Pneumothorax; Consolidation; Edema; Emphysema; Fibrosis; Pleural Thickening; Hernia.) 259 | 260 | **GT**: Cardiomegaly 261 | 262 | **Required capabilities**: recognition, knowledge 263 | 264 | --- 265 | 266 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_87.jpg) 267 | 268 | **Q**: How many books are there? 269 | 270 | **GT**: 63 271 | 272 | **Required capabilities**: recognition 273 | 274 | --- 275 | 276 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_92.jpg) 277 | 278 | **Q**: What is all the scene text in the image? 279 | 280 | **GT**: MR. WILLIAM SHAKESPEARES \ COMEDIES, HISTORIES, & TRAGEDIES.\ publifhed according to the True Originall Copies. 281 | 282 | **Required capabilities**: OCR 283 | 284 | --- 285 | 286 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_94.jpg) 287 | 288 | **Q**: What is the brand of this device? 289 | 290 | **GT**: MOTOROLA 291 | 292 | **Required capabilities**: OCR 293 | 294 | --- 295 | 296 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_96.jpg) 297 | 298 | **Q**: What are all the French scene text in the image? 299 | 300 | **GT**: HISTOIRE DE FRANCE \ De La préhistoire à ans jours \ Queelle Histoire 301 | 302 | **Required capabilities**: OCR 303 | 304 | --- 305 | 306 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_98.jpg) 307 | 308 | **Q**: Which continent is highlighted? 309 | 310 | **GT**: Asia 311 | 312 | **Required capabilities**: recognition, knowledge 313 | 314 | --- 315 | 316 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_104.png) 317 | 318 | **Q**: Where is this photo taken? 319 | 320 | **GT**: qatar 321 | 322 | **Required capabilities**: recognition, knowledge 323 | 324 | --- 325 | 326 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_106.jpg) 327 | 328 | **Q**: Are the trees taller than the giraffes? 329 | 330 | **GT**: no 331 | 332 | **Required capabilities**: recognition 333 | 334 | --- 335 | 336 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_107.jpg) 337 | 338 | **Q**: Is this airplane taking off or landing? 339 | 340 | **GT**: taking off 341 | 342 | **Required capabilities**: recognition 343 | 344 | --- 345 | 346 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_110.jpg) 347 | 348 | **Q**: Does the giraffe appear dangerous? 
349 | 350 | **GT**: no 351 | 352 | **Required capabilities**: recognition 353 | 354 | --- 355 | 356 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_111.jpg) 357 | 358 | **Q**: Is there any reflection of zebra in water? 359 | 360 | **GT**: yes 361 | 362 | **Required capabilities**: recognition 363 | 364 | --- 365 | 366 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_113.jpg) 367 | 368 | **Q**: Are all of the cats the same color? 369 | 370 | **GT**: no 371 | 372 | **Required capabilities**: recognition 373 | 374 | 375 | --- 376 | 377 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_124.jpg) 378 | 379 | **Q**: Are there napkins under the utensil to the left of the rice? 380 | 381 | **GT**: yes 382 | 383 | **Required capabilities**: recognition, spatial awareness 384 | 385 | --- 386 | 387 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_131.jpg) 388 | 389 | **Q**: Is the surfer to the left or to the right of the surfboard? 390 | 391 | **GT**: left 392 | 393 | **Required capabilities**: recognition, spatial awareness 394 | 395 | --- 396 | 397 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_139.jpg) 398 | 399 | **Q**: Why are the persons wearing black clothes? Answer it and give the rationale. 400 | 401 | **GT**: They are going to a funeral. I think so because it is traditional for people to wear black to a funeral. 402 | 403 | **Required capabilities**: recognition, knowledge, language generation 404 | 405 | --- 406 | 407 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_143.jpg) 408 | 409 | **Q**: Is the woman in red going to be mad at the man, for looking at the woman in dress? Answer it and give the rationale. 410 | 411 | **GT**: Yes, the woman in red and the man appear to be a couple and the woman in red would not appreciate the man checking out other women. I think so because in a monogamous relationship, partners are expected to be faithful to one another. 412 | 413 | **Required capabilities**: recognition, knowledge, language generation 414 | 415 | 416 | --- 417 | 418 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_149.jpg) 419 | 420 | **Q**: Can you explain this meme? 421 | 422 | **GT**: This meme is a commentary on the current situation of online learning due to the COVID-19 pandemic. It shows how different parties are affected by it.

The first two images show the Zoom company and the teachers throwing money in the air, which implies that they are profiting from the situation. Zoom, in particular, has seen a surge in usage and revenue due to the pandemic.

The third image shows the WiFi company also throwing money in the air, which implies that they are also profiting from the situation as more people are using the internet for online learning.

The last image shows a student with a sad clown face, which implies that students are not benefiting from the situation and are struggling with online learning. The \"Put on a happy face\" sign in the background further emphasizes the idea that students are putting on a brave face despite their struggles. 423 | 424 | **Required capabilities**: recognition, OCR, knowledge, language generation 425 | 426 | --- 427 | 428 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_153.jpg) 429 | 430 | **Q**: What is funny about this image? 431 | 432 | **GT**: It is a cartoon of a rhinoceros painting a picture and each picture has its rhino horn because the rhino horn grows in front of its eyes. The caption \"You see the world as you are!\" is a playful commentary on how our perspective shapes our perception of the world. 433 | 434 | **Required capabilities**: recognition, OCR, knowledge, language generation 435 | 436 | 437 | --- 438 | 439 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_158.jpg) 440 | 441 | **Q**: How to cook this dish? 442 | 443 | **GT**: The dish is called \"Hungarian Mushroom Soup\". This Hungarian mushroom soup has lots of flavor and is fairly quick to make. It's primarily a mushroom soup but derives a lot of its flavor from other ingredients. My family loves soup and this is one of their favorites.


Prep Time: 15 mins
Cook Time: 35 mins
Total Time: 50 mins
Servings: 6


Ingredients

4 tablespoons unsalted butter
2 cups chopped onions
1 pound fresh mushrooms, sliced
2 cups chicken broth
1 tablespoon soy sauce
1 tablespoon paprika
2 teaspoons dried dill weed
1 cup milk
3 tablespoons all-purpose flour
½ cup sour cream
¼ cup chopped fresh parsley
2 teaspoons lemon juice
1 teaspoon salt
ground black pepper to taste


Directions

Step1
Melt butter in a large pot over medium heat. Add onions; cook and stir until softened, about 5 minutes. Add mushrooms and sauté for 5 more minutes. Stir in broth, soy sauce, paprika, and dill; reduce heat to low, cover, and simmer for 15 minutes.

Step2
Whisk milk and flour together in a separate bowl; stir into soup until blended. Cover and simmer for 15 more minutes, stirring occasionally.

Step3
Add sour cream, parsley, lemon juice, salt, and ground black pepper; stir over low heat until warmed through, about 3 to 5 minutes. Serve immediately. 444 | 445 | **Required capabilities**: recognition, knowledge, language generation 446 | 447 | --- 448 | 449 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_159.jpg) 450 | 451 | **Q**: How to make this? 452 | 453 | **GT**: This is called \"Sparkling Cantaloupe Agua Fresca\". This refreshing agua fresca with cantaloupe and lime juice topped with club soda is perfect on a hot day.

Prep Time: 15 mins
Total Time: 15 mins
Servings: 8

Ingredients

2 pounds fresh cantaloupe cubes (6 cups)
2 cups water
6 tablespoons freshly squeezed lime juice (from 2 limes)
3 tablespoons honey
1/4 teaspoon salt
2 cups club soda, chilled

cantaloupe melon balls or cubes, for garnish
lime zest twists, for garnish


Directions

Step 1
Working in batches if needed, blend cantaloupe, water, lime juice, honey, and salt in a blender until smooth. Press through a sieve; discard solids. Add strained juice to a pitcher.

Step 2
Chill until ready to serve. Add club soda just before serving. Serve over ice, and garnish glasses with melon balls and lime zest twists. 454 | 455 | **Required capabilities**: recognition, knowledge, language generation 456 | 457 | --- 458 | 459 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_168.jpg) 460 | 461 | **Q**: Can you give a short introduction to this movie? 462 | 463 | **GT**: The Shawshank Redemption is a 1994 American drama film written and directed by Frank Darabont, based on the 1982 Stephen King novella Rita Hayworth and Shawshank Redemption. The film tells the story of banker Andy Dufresne (Tim Robbins), who is sentenced to life in Shawshank State Penitentiary for the murders of his wife and her lover, despite his claims of innocence. Over the following two decades, he befriends a fellow prisoner, contraband smuggler Ellis \"Red\" Redding (Morgan Freeman), and becomes instrumental in a money laundering operation led by the prison warden Samuel Norton (Bob Gunton). William Sadler, Clancy Brown, Gil Bellows, and James Whitmore appear in supporting roles. 464 | 465 | **Required capabilities**: recognition, knowledge, language generation 466 | 467 | 468 | ![MM-Vet example 49](mm-vet/images/v1_183.jpg) 469 | 470 | **Q**: Can you give a short introduction to this painting? 471 | 472 | **GT**: Arrangement in Grey and Black No. 1, best known under its colloquial name Whistler's Mother or Portrait of Artist's Mother, is a painting in oils on canvas created by the American-born painter James Abbott McNeill Whistler in 1871. The subject of the painting is Whistler's mother, Anna McNeill Whistler. The painting is 56.81 by 63.94 inches (1,443 mm × 1,624 mm), displayed in a frame of Whistler's own design. It is held by the Musée d'Orsay in Paris, having been bought by the French state in 1891. It is one of the most famous works by an American artist outside the United States. It has been variously described as an American icon and a Victorian Mona Lisa. 473 | 474 | **Required capabilities**: recognition, knowledge, language generation 475 | 476 | --- 477 | 478 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_193.jpg) 479 | 480 | **Q**: What is this disease (give short introduction) and how to treat it? 481 | 482 | **GT**: It is club root. Affecting most brassica crops (cabbage, broccoli, cauliflower, etc.), club root is a serious plant disease in North American home gardens. It is caused by the soil-borne fungus Plasmodiophora brassicae which infects susceptible plants through root hairs. Diseased roots become swollen, misshapen and deformed (clubbed) often cracking and rotting. As a result, plants have difficulty absorbing water and nutrients properly.

Plants often grow poorly and wilt during the heat of the day; plants often revive during cool nights. Outer leaves may turn yellow, purple or brown. Club root will reduce yields and can cause total crop failure.

Fungal spores can be spread by wind, water and garden tools. Disease development can occur over a wide range of conditions, but is favored by excessive moisture, low soil pH and soil temperatures between 64 and 77˚F. Spores can survive in the soil for as many as 10 years.

Treatment
1. Fungicides will NOT treat this soil-dwelling micro-organism.
2. Choose resistant cultivars when possible.
3. Try to prevent the occurrence of this disease by keeping a clean garden and rotating crops.
4. Keep in mind that the disease spores can persist in the soil for up to 20 years. If club root is present you may want to solarize the soil.*
5. Control susceptible weeds — mustard, radish, shepherd's purse — that may be infected to reduce potential buildup of the disease.
6. Carefully remove infected plants and sterilize garden tools (one part bleach to 4 parts water) after use.
7. Raise your soil's pH to a more alkaline 7.2 by mixing oyster shell or dolomite lime into your garden in the fall. Simple and affordable soil test kits are available to check pH often. 483 | 484 | **Required capabilities**: recognition, knowledge, language generation 485 | 486 | --- 487 | 488 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_205.jpg) 489 | 490 | **Q**: What is this disease (give short introduction) and how to treat it? 491 | 492 | **GT**: Joshua Abraham Norton (February 4, 1818 – January 8, 1880), known as Emperor Norton, was a resident of San Francisco, California, who in 1859 proclaimed himself \"Norton I., Emperor of the United States\". In 1863, after Napoleon III invaded Mexico, he took the secondary title of \"Protector of Mexico\". 493 | 494 | **Required capabilities**: recognition, knowledge, language generation 495 | 496 | --- 497 | 498 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_209.png) 499 | 500 | **Q**: The graph and table below give information about water use worldwide and water consumption in two different countries.

Summarise the information by selecting and reporting the main features, and make comparisons where relevant.

You should write at least 150 words. 501 | 502 | **GT**: The charts compare the amount of water used for agriculture, industry and homes around the world, and water use in Brazil and the Democratic Republic of Congo.

It is clear that global water needs rose significantly between 1900 and 2000, and that agriculture accounted for the largest proportion of water used. We can also see that water consumption was considerably higher in Brazil than in the Congo.

In 1900, around 500km³ of water was used by the agriculture sector worldwide. The figures for industrial and domestic water consumption stood at around one fifth of that amount. By 2000, global water use for agriculture had increased to around 3000km³, industrial water use had risen to just under half that amount, and domestic consumption had reached approximately 500km³.

In the year 2000, the populations of Brazil and the Congo were 176 million and 5.2 million respectively. Water consumption per person in Brazil, at 359m³, was much higher than that in the Congo, at only 8m³, and this could be explained by the fact that Brazil had 265 times more irrigated land. 503 | 504 | **Required capabilities**: recognition, OCR, language generation, spatial awareness 505 | 506 | --- 507 | 508 | ![MM-Vet example](https://raw.githubusercontent.com/yuweihao/MM-Vet_data/main/images/v1_215.jpg) 509 | 510 | **Q**: Chorleywood is a village near London whose population has increased steadily since the middle of the nineteenth century. The map below shows the development of the village.

Write a report for a university lecturer describing the development of the village.

You should write at least 150 words. 511 | 512 | **GT**: The map shows the growth of a village called Chorleywood between 1868 and 1994.

It is clear that the village grew as the transport infrastructure was improved. Four periods of development are shown on the map, and each of the populated areas is near to the main roads, the railway or the motorway.

From 1868 to 1883, Chorleywood covered a small area next to one of the main roads. Chorleywood Park and Golf Course is now located next to this original village area. The village grew along the main road to the south between 1883 and 1922, and in 1909 a railway line was built crossing this area from west to east. Chorleywood station is in this part of the village.

The expansion of Chorleywood continued to the east and west alongside the railway line until 1970. At that time, a motorway was built to the east of the village, and from 1970 to 1994, further development of the village took place around motorway intersections with the railway and one of the main roads. 513 | 514 | **Required capabilities**: recognition, OCR, language generation, spatial awareness -------------------------------------------------------------------------------- /inference/bard.py: -------------------------------------------------------------------------------- 1 | """ 2 | Install bardapi by 3 | ```bash 4 | pip install bardapi 5 | ``` 6 | 7 | Run this script by 8 | ```bash 9 | while true; do python bard.py; sleep 60; done 10 | ``` 11 | Currently we have to use loop in bash instead of python because the Bard-API seems have a bug. 12 | 13 | Remember to change your image folder and meta data path in this script. 14 | """ 15 | 16 | import pandas as pd 17 | import os 18 | import json 19 | import time 20 | from bardapi import Bard 21 | 22 | 23 | BARD_TOKEN = "YOUR_TOKEN_HERE" # https://github.com/dsdanielpark/Bard-API#authentication 24 | 25 | model_name = "bard" 26 | bard_error = "Temporarily unavailable due to traffic or an error in cookie values." 27 | 28 | # change the path to your own path 29 | results_path = f'../results/{model_name}.json' # path to save the results 30 | image_folder = f"/path/to/mm-vet/images" 31 | meta_data = "/path/to/mm-vet/mm-vet.json" 32 | 33 | 34 | with open(meta_data, 'r') as f: 35 | data = json.load(f) 36 | 37 | 38 | if os.path.exists(results_path): 39 | with open(results_path, 'r') as f: 40 | results = json.load(f) 41 | else: 42 | results = {} 43 | 44 | data_num = len(data) 45 | 46 | # time.sleep(60) 47 | for i in range(len(data)): 48 | id = f"v1_{i}" 49 | if id in results and not (bard_error in results[id]): 50 | continue 51 | # time.sleep(60) 52 | imagename = data[id]['imagename'] 53 | img_path = os.path.join(image_folder, imagename) 54 | prompt = data[id]['question'] 55 | prompt = prompt.strip() 56 | print(f"\nPrompt: {prompt}") 57 | # load sample image 58 | bard = Bard(token=BARD_TOKEN) 59 | image = open(img_path, 'rb').read() # (jpeg, png, webp) are supported. 
60 | bard_answer = bard.ask_about_image(prompt, image) 61 | response = bard_answer['content'] 62 | if bard_error in response: 63 | time.sleep(60) 64 | break 65 | 66 | print(f"Response: {response}") 67 | results[id] = response 68 | with open(results_path, 'w') as f: 69 | json.dump(results, f, indent=4) 70 | break 71 | -------------------------------------------------------------------------------- /inference/bard.sh: -------------------------------------------------------------------------------- 1 | while true; do python bard.py; sleep 60; done 2 | -------------------------------------------------------------------------------- /inference/claude.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python claude.py --mmvet_path /path/to/mm-vet --anthropic_api_key 4 | """ 5 | import os 6 | import argparse 7 | import anthropic 8 | from utils import evaluate_on_mmvet, encode_image 9 | 10 | 11 | class Claude: 12 | def __init__(self, api_key, 13 | model="claude-3-opus-20240229", temperature=0.0, 14 | max_tokens=512, system=None): 15 | self.model = model 16 | self.client = anthropic.Anthropic( 17 | api_key=api_key, 18 | ) 19 | self.system = system 20 | self.temperature = temperature 21 | self.max_tokens = max_tokens 22 | 23 | def get_response(self, image_path, prompt="What's in this image?"): 24 | base64_image = encode_image(image_path) 25 | image_format = "png" if image_path.endswith('.png') else "jpeg" 26 | 27 | messages = [] 28 | content = [ 29 | { 30 | "type": "text", 31 | "text": prompt, 32 | }, 33 | { 34 | "type": "image", 35 | "source": { 36 | "type": "base64", 37 | "media_type": f"image/{image_format}", 38 | "data": base64_image, 39 | } 40 | } 41 | 42 | ] 43 | 44 | messages.append({ 45 | "role": "user", 46 | "content": content, 47 | }) 48 | 49 | payload = { 50 | "model": self.model, 51 | "messages": messages, 52 | "max_tokens": self.max_tokens, 53 | "temperature": self.temperature, 54 | } 55 | 56 | if self.system: 57 | payload["system"] = self.system 58 | 59 | response = self.client.messages.create(**payload) 60 | response_text = response.content[0].text 61 | return response_text.strip() 62 | 63 | 64 | def arg_parser(): 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument( 67 | "--mmvet_path", 68 | type=str, 69 | default="/path/to/mm-vet", 70 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 71 | ) 72 | parser.add_argument( 73 | "--result_path", 74 | type=str, 75 | default="results", 76 | ) 77 | parser.add_argument( 78 | "--anthropic_api_key", type=str, default=None, 79 | help="refer to https://docs.anthropic.com/claude/reference/getting-started-with-the-api" 80 | ) 81 | parser.add_argument( 82 | "--model_name", 83 | type=str, 84 | default="claude-3-opus-20240229", 85 | help="Claude model name", 86 | ) 87 | args = parser.parse_args() 88 | return args 89 | 90 | 91 | if __name__ == "__main__": 92 | args = arg_parser() 93 | 94 | # prepare the model 95 | if args.anthropic_api_key: 96 | ANTHROPIC_API_KEY = args.anthropic_api_key 97 | else: 98 | ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY') 99 | 100 | if ANTHROPIC_API_KEY is None: 101 | raise ValueError("Please set the ANTHROPIC_API_KEY environment variable or pass it as an argument") 102 | 103 | model = Claude(ANTHROPIC_API_KEY, model=args.model_name) 104 | 105 | # evalute on mm-vet 106 | evaluate_on_mmvet(args, model) 107 | -------------------------------------------------------------------------------- /inference/gemini_vision.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Please refer to https://ai.google.dev/tutorials/python_quickstart to get the API key 3 | 4 | Install with `pip install -q -U google-generativeai`, 5 | Then `python gemini_vision.py --mmvet_path /path/to/mm-vet --google_api_key YOUR_API_KEY` 6 | """ 7 | 8 | import os 9 | import time 10 | from pathlib import Path 11 | import google.generativeai as genai 12 | import argparse 13 | from utils import evaluate_on_mmvet 14 | 15 | class Gemini: 16 | def __init__(self, model="gemini-pro-vision"): 17 | self.model = genai.GenerativeModel(model) 18 | 19 | def get_response(self, image_path, prompt) -> str: 20 | # Query the model 21 | text = "" 22 | while len(text) < 1: 23 | try: 24 | image_path = Path(image_path) 25 | image = { 26 | 'mime_type': f'image/{image_path.suffix[1:].replace("jpg", "jpeg")}', 27 | 'data': image_path.read_bytes() 28 | } 29 | response = self.model.generate_content( 30 | [ 31 | # Add an example image 32 | image, 33 | # Add an example query 34 | prompt, 35 | ] 36 | ) 37 | try: 38 | text = response._result.candidates[0].content.parts[0].text 39 | except: 40 | text = " " 41 | except Exception as error: 42 | print(error) 43 | sleep_time = 30 44 | print(f'Sleeping for {sleep_time} seconds') 45 | time.sleep(sleep_time) 46 | return text.strip() 47 | 48 | 49 | def arg_parser(): 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument( 52 | "--mmvet_path", 53 | type=str, 54 | default="/path/to/mm-vet", 55 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 56 | ) 57 | parser.add_argument( 58 | "--result_path", 59 | type=str, 60 | default="results", 61 | ) 62 | parser.add_argument( 63 | "--google_api_key", type=str, default=None, 64 | help="refer to https://ai.google.dev/tutorials/python_quickstart" 65 | ) 66 | parser.add_argument( 67 | "--model_name", 68 | type=str, 69 | default="gemini-pro-vision", 70 | help="Gemini model name", 71 | ) 72 | args = parser.parse_args() 73 | return args 74 | 75 | 76 | if __name__ == "__main__": 77 | args = arg_parser() 78 | 79 | # prepare the model 80 | if args.google_api_key: 81 | GOOGLE_API_KEY = args.google_api_key 82 | else: 83 | GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY') 84 | 85 | if GOOGLE_API_KEY is None: 86 | raise ValueError("Please set the GOOGLE_API_KEY environment variable or pass it as an argument") 87 | 88 | genai.configure(api_key=GOOGLE_API_KEY) 89 | model = Gemini(model=args.model_name) 90 | 91 | # evaluate on mm-vet 92 | evaluate_on_mmvet(args, model) -------------------------------------------------------------------------------- /inference/gpt4v.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python gpt4v.py --mmvet_path /path/to/mm-vet --openai_api_key 4 | """ 5 | 6 | import time 7 | import os 8 | import requests 9 | import argparse 10 | from utils import evaluate_on_mmvet, encode_image 11 | 12 | 13 | class GPT4V: 14 | def __init__(self, api_key, model="gpt-4-vision-preview", image_detail="auto", 15 | system_text="You are a helpful assistant. 
Generate a short and concise response to the following image text pair."): 16 | self.api_key = api_key 17 | self.model = model 18 | self.image_detail = image_detail 19 | self.system_text = system_text 20 | self.headers = { 21 | "Content-Type": "application/json", 22 | "Authorization": f"Bearer {self.api_key}" 23 | } 24 | self.url = "https://api.openai.com/v1/chat/completions" 25 | 26 | def get_response(self, image_path, prompt="What's in this image?"): 27 | base64_image = encode_image(image_path) 28 | image_format = "data:image/png;base64" if 'png' in image_path else "data:image/jpeg;base64" 29 | messages = [] 30 | if self.system_text is not None or self.system_text != "": 31 | messages.append({ 32 | "role": "system", 33 | "content": [ 34 | { 35 | "type": "text", 36 | "text": self.system_text, 37 | }, 38 | ] 39 | }) 40 | messages.append({ 41 | "role": "user", 42 | "content": [ 43 | { 44 | "type": "text", 45 | "text": prompt 46 | }, 47 | { 48 | "type": "image_url", 49 | "image_url": { 50 | "url": f"{image_format},{base64_image}", 51 | "detail": self.image_detail, 52 | } 53 | } 54 | ] 55 | }) 56 | 57 | payload = { 58 | "model": self.model, 59 | "messages": messages, 60 | "max_tokens": 300, 61 | } 62 | 63 | response_text, retry, response_json, regular_time = '', 0, None, 30 64 | while len(response_text) < 1: 65 | retry += 1 66 | time.sleep(1) 67 | try: 68 | response = requests.post(self.url, headers=self.headers, json=payload) 69 | response_json = response.json() 70 | # print(response_json) 71 | except Exception as e: 72 | print(e) 73 | time.sleep(regular_time) 74 | continue 75 | if response.status_code != 200: 76 | print(response.headers,response.content) 77 | print(image_path) 78 | print(f"The response status code for is {response.status_code} (Not OK)") 79 | time.sleep(regular_time) 80 | continue 81 | if 'choices' not in response_json: 82 | time.sleep(regular_time) 83 | continue 84 | response_text = response_json["choices"][0]["message"]["content"] 85 | return response_text.strip() 86 | 87 | 88 | def arg_parser(): 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument( 91 | "--mmvet_path", 92 | type=str, 93 | default="/path/to/mm-vet", 94 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 95 | ) 96 | parser.add_argument( 97 | "--result_path", 98 | type=str, 99 | default="results", 100 | ) 101 | parser.add_argument( 102 | "--openai_api_key", type=str, default=None, 103 | help="refer to https://platform.openai.com/docs/quickstart?context=python" 104 | ) 105 | parser.add_argument( 106 | "--model_name", 107 | type=str, 108 | default="gpt-4-vision-preview", 109 | help="GPT model name", 110 | ) 111 | parser.add_argument( 112 | "--image_detail", 113 | type=str, 114 | default="auto", 115 | help="Refer to https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding", 116 | ) 117 | args = parser.parse_args() 118 | return args 119 | 120 | 121 | if __name__ == "__main__": 122 | args = arg_parser() 123 | 124 | # prepare the model 125 | if args.openai_api_key: 126 | OPENAI_API_KEY = args.openai_api_key 127 | else: 128 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 129 | 130 | if OPENAI_API_KEY is None: 131 | raise ValueError("Please set the OPENAI_API_KEY environment variable or pass it as an argument") 132 | 133 | model = GPT4V(OPENAI_API_KEY, model=args.model_name, image_detail=args.image_detail) 134 | args.model_name = f"{args.model_name}_detail-{args.image_detail}" 135 | 136 | # evaluate on mm-vet 137 | evaluate_on_mmvet(args, model) 
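# Note: like the other scripts in this folder, this one saves answers through
# `evaluate_on_mmvet` (see utils.py below) as a flat {question_id: response} map in
# results/{model_name}.json, which mm-vet_evaluator.py later grades. A minimal sketch
# of that file, with hypothetical responses:
#
#     {
#         "v1_0": "There are two apples on the table.",
#         "v1_1": "The sign reads 'Stop'."
#     }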
-------------------------------------------------------------------------------- /inference/qwen.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python qwen.py --mmvet_path /path/to/mm-vet --dashscope_api_key 4 | """ 5 | import os 6 | import argparse 7 | from utils import evaluate_on_mmvet 8 | from http import HTTPStatus 9 | import dashscope 10 | 11 | class Qwen: 12 | def __init__(self, model='qwen-vl-max'): 13 | self.model = model 14 | 15 | def get_response(self, image_path, prompt="What's in this image?"): 16 | messages = [] 17 | content = [ 18 | { 19 | "text": prompt, 20 | }, 21 | { 22 | "image": f"file://{image_path}" 23 | } 24 | ] 25 | 26 | messages.append({ 27 | "role": "user", 28 | "content": content, 29 | }) 30 | 31 | payload = { 32 | "model": self.model, 33 | "messages": messages, 34 | } 35 | 36 | response = dashscope.MultiModalConversation.call(**payload) 37 | if response.status_code == HTTPStatus.OK: 38 | rps = response['output']['choices'][0]['message']['content'] 39 | for rp in rps: 40 | if 'text' in rp: 41 | response_text = rp['text'] 42 | return response_text.strip() 43 | else: 44 | print(response.code) # The error code. 45 | print(response.message) # The error message. 46 | return "" 47 | 48 | 49 | def arg_parser(): 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument( 52 | "--mmvet_path", 53 | type=str, 54 | default="/path/to/mm-vet", 55 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 56 | ) 57 | parser.add_argument( 58 | "--result_path", 59 | type=str, 60 | default="results", 61 | ) 62 | parser.add_argument( 63 | "--dashscope_api_key", type=str, default=None, 64 | help="refer to https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start" 65 | ) 66 | parser.add_argument( 67 | "--model_name", 68 | type=str, 69 | default="qwen-vl-max", 70 | help="Qwen model name", 71 | ) 72 | args = parser.parse_args() 73 | return args 74 | 75 | 76 | if __name__ == "__main__": 77 | args = arg_parser() 78 | 79 | # prepare the model 80 | if args.dashscope_api_key: 81 | DASHSCOPE_API_KEY = args.dashscope_api_key 82 | else: 83 | DASHSCOPE_API_KEY = os.getenv('DASHSCOPE_API_KEY') 84 | 85 | if DASHSCOPE_API_KEY is None: 86 | raise ValueError("Please set the DASHSCOPE_API_KEY environment variable or pass it as an argument") 87 | 88 | dashscope.api_key = DASHSCOPE_API_KEY 89 | model = Qwen(model=args.model_name) 90 | 91 | # evalute on mm-vet 92 | evaluate_on_mmvet(args, model) 93 | -------------------------------------------------------------------------------- /inference/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import base64 4 | 5 | 6 | # Function to encode the image 7 | def encode_image(image_path): 8 | with open(image_path, "rb") as image_file: 9 | return base64.b64encode(image_file.read()).decode('utf-8') 10 | 11 | 12 | def evaluate_on_mmvet(args, model): 13 | if os.path.exists(args.result_path) is False: 14 | os.makedirs(args.result_path) 15 | results_path = os.path.join(args.result_path, f"{args.model_name}.json") 16 | image_folder = os.path.join(args.mmvet_path, "images") 17 | meta_data = os.path.join(args.mmvet_path, "mm-vet.json") 18 | 19 | with open(meta_data, 'r') as f: 20 | data = json.load(f) 21 | 22 | if os.path.exists(results_path): 23 | with open(results_path, 'r') as f: 24 | results = json.load(f) 25 | else: 26 | results = {} 27 | 28 | for i in range(len(data)): 29 | id = f"v1_{i}" 
30 | if id in results: 31 | continue 32 | imagename = data[id]['imagename'] 33 | img_path = os.path.join(image_folder, imagename) 34 | prompt = data[id]['question'] 35 | print(f"\n{id}") 36 | print(f"Image: {imagename}") 37 | print(f"Prompt: {prompt}") 38 | response = model.get_response(img_path, prompt) 39 | print(f"Response: {response}") 40 | results[id] = response 41 | with open(results_path, 'w') as f: 42 | json.dump(results, f, indent=4) -------------------------------------------------------------------------------- /mm-vet_evaluator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from openai import OpenAI\n", 10 | "import json\n", 11 | "import os\n", 12 | "from tqdm import tqdm\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "from collections import Counter\n", 16 | "import time\n", 17 | "import pathlib\n", 18 | "client = OpenAI(\n", 19 | " # This is the default and can be omitted\n", 20 | " api_key=os.environ.get(\"OPENAI_API_KEY\"),\n", 21 | ")" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "gpt_model = \"gpt-4-0613\"\n", 31 | "\n", 32 | "\n", 33 | "prompt = \"\"\"Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. in the ground truth means it is totally right only when all elements in the ground truth are present in the prediction, and means it is totally right when any one element in the ground truth is present in the prediction. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Just complete the last space of the correctness score.\n", 34 | "\n", 35 | "Question | Ground truth | Prediction | Correctness\n", 36 | "--- | --- | --- | ---\n", 37 | "What is x in the equation? | -1 -5 | x = 3 | 0.0\n", 38 | "What is x in the equation? | -1 -5 | x = -1 | 0.5\n", 39 | "What is x in the equation? | -1 -5 | x = -5 | 0.5\n", 40 | "What is x in the equation? | -1 -5 | x = -5 or 5 | 0.5\n", 41 | "What is x in the equation? | -1 -5 | x = -1 or x = -5 | 1.0\n", 42 | "Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme talks about Iceland and Greenland. It's pointing out that despite their names, Iceland is not very icy and Greenland isn't very green. | 0.4\n", 43 | "Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme is using humor to point out the misleading nature of Iceland's and Greenland's names. Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow. 
The text 'This is why I have trust issues' is a playful way to suggest that these contradictions can lead to distrust or confusion. The humor in this meme is derived from the unexpected contrast between the names of the countries and their actual physical characteristics. | 1.0\n", 44 | "\"\"\"" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# load metadata\n", 54 | "# Download mm-vet.zip and `unzip mm-vet.zip` and change the path below\n", 55 | "mmvet_path = \"/path/to/mm-vet\"\n", 56 | "use_sub_set = False\n", 57 | "decimal_places = 1 # number of decimal places to round to\n", 58 | "\n", 59 | "\n", 60 | "if use_sub_set:\n", 61 | " bard_set_file = os.path.join(mmvet_path, \"bard_set.json\")\n", 62 | " with open(bard_set_file, 'r') as f:\n", 63 | " sub_set = json.load(f)\n", 64 | " sub_set_name = 'bardset'\n", 65 | " sub_set_name = sub_set_name + '_'\n", 66 | "else:\n", 67 | " sub_set = None\n", 68 | " sub_set_name = ''\n", 69 | "\n", 70 | "mmvet_metadata = os.path.join(mmvet_path, \"mm-vet.json\")\n", 71 | "with open(mmvet_metadata, 'r') as f:\n", 72 | " data = json.load(f)\n", 73 | "\n", 74 | "\n", 75 | "counter = Counter()\n", 76 | "cap_set_list = []\n", 77 | "cap_set_counter = []\n", 78 | "len_data = 0\n", 79 | "for id, value in data.items():\n", 80 | " if sub_set is not None and id not in sub_set:\n", 81 | " continue\n", 82 | " question = value[\"question\"]\n", 83 | " answer = value[\"answer\"]\n", 84 | " cap = value[\"capability\"]\n", 85 | " cap = set(cap)\n", 86 | " counter.update(cap)\n", 87 | " if cap not in cap_set_list:\n", 88 | " cap_set_list.append(cap)\n", 89 | " cap_set_counter.append(1)\n", 90 | " else:\n", 91 | " cap_set_counter[cap_set_list.index(cap)] += 1\n", 92 | " \n", 93 | " len_data += 1\n", 94 | "\n", 95 | "sorted_list = counter.most_common()\n", 96 | "columns = [k for k, v in sorted_list]\n", 97 | "columns.append(\"total\")\n", 98 | "columns.append(\"std\")\n", 99 | "columns.append('runs')\n", 100 | "df = pd.DataFrame(columns=columns)\n", 101 | "\n", 102 | "\n", 103 | "cap_set_sorted_indices = np.argsort(-np.array(cap_set_counter))\n", 104 | "new_cap_set_list = []\n", 105 | "new_cap_set_counter = []\n", 106 | "for index in cap_set_sorted_indices:\n", 107 | " new_cap_set_list.append(cap_set_list[index])\n", 108 | " new_cap_set_counter.append(cap_set_counter[index])\n", 109 | "\n", 110 | "cap_set_list = new_cap_set_list\n", 111 | "cap_set_counter = new_cap_set_counter\n", 112 | "cap_set_names = [\"_\".join(list(cap_set)) for cap_set in cap_set_list]\n", 113 | "\n", 114 | "columns2 = cap_set_names\n", 115 | "columns2.append(\"total\")\n", 116 | "columns2.append(\"std\")\n", 117 | "columns2.append('runs')\n", 118 | "df2 = pd.DataFrame(columns=columns2)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 4, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "result_file = \"results/llava_llama2_13b_chat.json\" # change your model result_file\n", 128 | "result_path = \"results\" # path to save grading results\n", 129 | "num_run = 1 # we set it as 5 in the paper\n", 130 | "\n", 131 | "if os.path.exists(result_file) is False:\n", 132 | " raise ValueError(\"Result file does not exist\")\n", 133 | "if not result_file.endswith(('.json', '.JSON')):\n", 134 | " raise ValueError(\"Result file should be a json file\")\n", 135 | "model = pathlib.Path(result_file).stem\n", 136 | "# grade results for each sample to svae\n", 137 | "grade_file = 
f'{model}_{gpt_model}-grade-{num_run}runs.json'\n", 138 | "grade_file = os.path.join(result_path, grade_file)\n", 139 | "\n", 140 | "# score results regarding capabilities/capability integration to save\n", 141 | "cap_score_file = f'{model}_{sub_set_name}{gpt_model}-cap-score-{num_run}runs.csv'\n", 142 | "cap_score_file = os.path.join(result_path, cap_score_file)\n", 143 | "cap_int_score_file = f'{model}_{sub_set_name}{gpt_model}-cap-int-score-{num_run}runs.csv'\n", 144 | "cap_int_score_file = os.path.join(result_path, cap_int_score_file)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 5, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "eval run 0\n" 157 | ] 158 | }, 159 | { 160 | "name": "stderr", 161 | "output_type": "stream", 162 | "text": [ 163 | " 0%| | 0/218 [00:00 0:\n", 187 | " for k, v in grade_results.items():\n", 188 | " if len(v['score']) < num_run:\n", 189 | " need_more_runs = True\n", 190 | " break\n", 191 | " return need_more_runs or len(grade_results) < len_data\n", 192 | "\n", 193 | "\n", 194 | "while need_more_runs():\n", 195 | " for j in range(num_run):\n", 196 | " print(f'eval run {j}')\n", 197 | " for id, line in tqdm(data.items()):\n", 198 | " if sub_set is not None and id not in sub_set:\n", 199 | " continue\n", 200 | " if id in grade_results and len(grade_results[id]['score']) >= (j + 1):\n", 201 | " continue\n", 202 | "\n", 203 | " model_pred = results[id]\n", 204 | " \n", 205 | " question = prompt + '\\n' + ' | '.join([line['question'], line['answer'].replace(\"\", \" \").replace(\"\", \" \"), model_pred, \"\"])\n", 206 | " messages = [\n", 207 | " {\"role\": \"user\", \"content\": question},\n", 208 | " ]\n", 209 | "\n", 210 | " if id not in grade_results:\n", 211 | " sample_grade = {'model': [], 'content': [], 'score': []}\n", 212 | " else:\n", 213 | " sample_grade = grade_results[id]\n", 214 | "\n", 215 | " \n", 216 | " grade_sample_run_complete = False\n", 217 | " temperature = 0.0\n", 218 | "\n", 219 | " while not grade_sample_run_complete:\n", 220 | " try:\n", 221 | " response = client.chat.completions.create(\n", 222 | " model=gpt_model,\n", 223 | " max_tokens=3,\n", 224 | " temperature=temperature,\n", 225 | " messages=messages)\n", 226 | " content = response.choices[0].message.content\n", 227 | " flag = True\n", 228 | " try_time = 1\n", 229 | " while flag:\n", 230 | " try:\n", 231 | " content = content.split(' ')[0].strip()\n", 232 | " score = float(content)\n", 233 | " if score > 1.0 or score < 0.0:\n", 234 | " assert False\n", 235 | " flag = False\n", 236 | " except:\n", 237 | " question = prompt + '\\n' + ' | '.join([line['question'], line['answer'].replace(\"\", \" \").replace(\"\", \" \"), model_pred, \"\"]) + \"\\nPredict the correctness of the answer (digit): \"\n", 238 | " messages = [\n", 239 | " {\"role\": \"user\", \"content\": question},\n", 240 | " ]\n", 241 | " response = client.chat.completions.create(\n", 242 | " model=gpt_model,\n", 243 | " max_tokens=3,\n", 244 | " temperature=temperature,\n", 245 | " messages=messages)\n", 246 | " content = response.choices[0].message.content\n", 247 | " try_time += 1\n", 248 | " temperature += 0.5\n", 249 | " print(f\"{id} try {try_time} times\")\n", 250 | " print(content)\n", 251 | " if try_time > 5:\n", 252 | " score = 0.0\n", 253 | " flag = False\n", 254 | " grade_sample_run_complete = True\n", 255 | " except:\n", 256 | " # gpt4 may have token rate limit\n", 257 | " print(\"sleep 30s\")\n", 
258 | " time.sleep(30)\n", 259 | "\n", 260 | " if len(sample_grade['model']) >= j + 1:\n", 261 | " sample_grade['model'][j] = response.model\n", 262 | " sample_grade['content'][j] = content\n", 263 | " sample_grade['score'][j] = score\n", 264 | " else:\n", 265 | " sample_grade['model'].append(response.model)\n", 266 | " sample_grade['content'].append(content)\n", 267 | " sample_grade['score'].append(score)\n", 268 | " grade_results[id] = sample_grade\n", 269 | "\n", 270 | " with open(grade_file, 'w') as f:\n", 271 | " json.dump(grade_results, f, indent=4)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 6, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "assert not need_more_runs()\n", 281 | "cap_socres = {k: [0.0]*num_run for k in columns[:-2]}\n", 282 | "counter['total'] = len_data\n", 283 | "\n", 284 | "cap_socres2 = {k: [0.0]*num_run for k in columns2[:-2]}\n", 285 | "counter2 = {columns2[i]:cap_set_counter[i] for i in range(len(cap_set_counter))}\n", 286 | "counter2['total'] = len_data\n", 287 | "\n", 288 | "for k, v in grade_results.items():\n", 289 | " if sub_set is not None and k not in sub_set:\n", 290 | " continue\n", 291 | " for i in range(num_run):\n", 292 | " score = v['score'][i]\n", 293 | " caps = set(data[k]['capability'])\n", 294 | " for c in caps:\n", 295 | " cap_socres[c][i] += score\n", 296 | " \n", 297 | " cap_socres['total'][i] += score\n", 298 | "\n", 299 | " index = cap_set_list.index(caps)\n", 300 | " cap_socres2[cap_set_names[index]][i] += score\n", 301 | " cap_socres2['total'][i] += score\n", 302 | "\n", 303 | "for k, v in cap_socres.items():\n", 304 | " cap_socres[k] = np.array(v) / counter[k] *100\n", 305 | "\n", 306 | "\n", 307 | "std = round(cap_socres['total'].std(), decimal_places)\n", 308 | "total_copy = cap_socres['total'].copy()\n", 309 | "runs = str(list(np.round(total_copy, decimal_places)))\n", 310 | "\n", 311 | "for k, v in cap_socres.items():\n", 312 | " cap_socres[k] = round(v.mean(), decimal_places)\n", 313 | "\n", 314 | "cap_socres['std'] = std\n", 315 | "cap_socres['runs'] = runs\n", 316 | "df.loc[model] = cap_socres\n", 317 | "\n", 318 | "\n", 319 | "for k, v in cap_socres2.items():\n", 320 | " cap_socres2[k] = round(np.mean(np.array(v) / counter2[k] *100), decimal_places)\n", 321 | "cap_socres2['std'] = std\n", 322 | "cap_socres2['runs'] = runs\n", 323 | "df2.loc[model] = cap_socres2\n", 324 | "\n", 325 | "df.to_csv(cap_score_file)\n", 326 | "df2.to_csv(cap_int_score_file)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 7, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/html": [ 337 | "
\n", 338 | "\n", 351 | "\n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | "
recocrknowgenspatmathtotalstdruns
llava_llama2_13b_chat39.723.227.130.430.87.733.30.0[33.3]
\n", 381 | "
" 382 | ], 383 | "text/plain": [ 384 | " rec ocr know gen spat math total std runs\n", 385 | "llava_llama2_13b_chat 39.7 23.2 27.1 30.4 30.8 7.7 33.3 0.0 [33.3]" 386 | ] 387 | }, 388 | "execution_count": 7, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "# when use subset, please note the column order is different from the full set\n", 395 | "# because it ranks by numbers of capabilties/capability integrations\n", 396 | "df" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 8, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/html": [ 407 | "
\n", 408 | "\n", 421 | "\n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | "
rec_know_genrecocr_spatocr_spat_mathrec_spatocrocr_mathrec_knowocr_rec_know_genocr_rec_spat_genocr_rec_spatocr_recocr_spat_knowrec_spat_knowocr_spat_genocr_rec_spat_mathtotalstdruns
llava_llama2_13b_chat30.559.523.514.358.331.70.027.85.060.028.650.033.30.010.00.033.30.0[33.3]
\n", 471 | "
" 472 | ], 473 | "text/plain": [ 474 | " rec_know_gen rec ocr_spat ocr_spat_math rec_spat \\\n", 475 | "llava_llama2_13b_chat 30.5 59.5 23.5 14.3 58.3 \n", 476 | "\n", 477 | " ocr ocr_math rec_know ocr_rec_know_gen \\\n", 478 | "llava_llama2_13b_chat 31.7 0.0 27.8 5.0 \n", 479 | "\n", 480 | " ocr_rec_spat_gen ocr_rec_spat ocr_rec ocr_spat_know \\\n", 481 | "llava_llama2_13b_chat 60.0 28.6 50.0 33.3 \n", 482 | "\n", 483 | " rec_spat_know ocr_spat_gen ocr_rec_spat_math total \\\n", 484 | "llava_llama2_13b_chat 0.0 10.0 0.0 33.3 \n", 485 | "\n", 486 | " std runs \n", 487 | "llava_llama2_13b_chat 0.0 [33.3] " 488 | ] 489 | }, 490 | "execution_count": 8, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "df2" 497 | ] 498 | } 499 | ], 500 | "metadata": { 501 | "kernelspec": { 502 | "display_name": "base", 503 | "language": "python", 504 | "name": "python3" 505 | }, 506 | "language_info": { 507 | "codemirror_mode": { 508 | "name": "ipython", 509 | "version": 3 510 | }, 511 | "file_extension": ".py", 512 | "mimetype": "text/x-python", 513 | "name": "python", 514 | "nbconvert_exporter": "python", 515 | "pygments_lexer": "ipython3", 516 | "version": "3.10.9" 517 | }, 518 | "orig_nbformat": 4 519 | }, 520 | "nbformat": 4, 521 | "nbformat_minor": 2 522 | } 523 | -------------------------------------------------------------------------------- /mm-vet_evaluator.py: -------------------------------------------------------------------------------- 1 | # in case you want to run this script independently 2 | 3 | import argparse 4 | from openai import OpenAI 5 | from openai._exceptions import RateLimitError 6 | import json 7 | import os 8 | from tqdm import tqdm 9 | import pandas as pd 10 | import numpy as np 11 | from collections import Counter 12 | import time 13 | import pathlib 14 | 15 | prompt = """Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. in the ground truth means it is totally right only when all elements in the ground truth are present in the prediction, and means it is totally right when any one element in the ground truth is present in the prediction. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Just complete the last space of the correctness score. 16 | 17 | Question | Ground truth | Prediction | Correctness 18 | --- | --- | --- | --- 19 | What is x in the equation? | -1 -5 | x = 3 | 0.0 20 | What is x in the equation? | -1 -5 | x = -1 | 0.5 21 | What is x in the equation? | -1 -5 | x = -5 | 0.5 22 | What is x in the equation? | -1 -5 | x = -5 or 5 | 0.5 23 | What is x in the equation? | -1 -5 | x = -1 or x = -5 | 1.0 24 | Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme talks about Iceland and Greenland. It's pointing out that despite their names, Iceland is not very icy and Greenland isn't very green. | 0.4 25 | Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. 
The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme is using humor to point out the misleading nature of Iceland's and Greenland's names. Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow. The text 'This is why I have trust issues' is a playful way to suggest that these contradictions can lead to distrust or confusion. The humor in this meme is derived from the unexpected contrast between the names of the countries and their actual physical characteristics. | 1.0 26 | """ 27 | 28 | 29 | def arg_parser(prompt=prompt): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | "--mmvet_path", 33 | type=str, 34 | default="/path/to/mm-vet", 35 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 36 | ) 37 | parser.add_argument( 38 | "--result_file", 39 | type=str, 40 | default="results/llava_llama2_13b_chat.json", 41 | help="path to the model result file, must end with .json", 42 | ) 43 | parser.add_argument( 44 | "--result_path", 45 | type=str, 46 | default="results", 47 | help="path to save the grading results", 48 | ) 49 | parser.add_argument( 50 | "--openai_api_key", type=str, default=None, 51 | help="If not specified, use OPENAI_API_KEY environment variable." 52 | ) 53 | parser.add_argument( 54 | "--gpt_model", type=str, default="gpt-4-0613", help="gpt model name" 55 | ) 56 | parser.add_argument( 57 | "--prompt", type=str, default=prompt, help="prompt for the model" 58 | ) 59 | parser.add_argument( 60 | "--use_sub_set", 61 | action="store_true", 62 | help="use a subset of the data for debugging", 63 | ) 64 | parser.add_argument( 65 | "--decimal_places", 66 | type=int, 67 | default=1, 68 | help="number of decimal places to round to", 69 | ) 70 | parser.add_argument( 71 | "--num_run", 72 | type=int, 73 | default=1, 74 | help="we set it as 5 in the paper", 75 | ) 76 | args = parser.parse_args() 77 | return args 78 | 79 | def get_file_names(args, model, sub_set_name): 80 | # grade results for each sample to svae 81 | grade_file = f"{model}_{args.gpt_model}-grade-{args.num_run}runs.json" 82 | grade_file = os.path.join(args.result_path, grade_file) 83 | 84 | # score results regarding capabilities/capability integration to save 85 | cap_score_file = ( 86 | f"{model}_{sub_set_name}{args.gpt_model}-cap-score-{args.num_run}runs.csv" 87 | ) 88 | cap_score_file = os.path.join(args.result_path, cap_score_file) 89 | cap_int_score_file = f"{model}_{sub_set_name}{args.gpt_model}-cap-int-score-{args.num_run}runs.csv" 90 | cap_int_score_file = os.path.join(args.result_path, cap_int_score_file) 91 | return grade_file, cap_score_file, cap_int_score_file 92 | 93 | 94 | def load_metadata(args): 95 | if args.use_sub_set: 96 | bard_set_file = os.path.join(args.mmvet_path, "bard_set.json") 97 | with open(bard_set_file, "r") as f: 98 | sub_set = json.load(f) 99 | sub_set_name = "bardset" 100 | sub_set_name = sub_set_name + "_" 101 | else: 102 | sub_set = None 103 | sub_set_name = "" 104 | 105 | mmvet_metadata = os.path.join(args.mmvet_path, "mm-vet.json") 106 | with open(mmvet_metadata, "r") as f: 107 | data = json.load(f) 108 | 109 | counter = Counter() 110 | cap_set_list = [] 111 | cap_set_counter = [] 112 | len_data = 0 113 | for id, value in data.items(): 114 | if sub_set is not None and id not in sub_set: 115 | continue 116 | cap = value["capability"] 117 | cap = set(cap) 118 | counter.update(cap) 119 | if cap not in 
cap_set_list: 120 | cap_set_list.append(cap) 121 | cap_set_counter.append(1) 122 | else: 123 | cap_set_counter[cap_set_list.index(cap)] += 1 124 | 125 | len_data += 1 126 | 127 | sorted_list = counter.most_common() 128 | columns = [k for k, v in sorted_list] 129 | columns.append("total") 130 | columns.append("std") 131 | columns.append("runs") 132 | df = pd.DataFrame(columns=columns) 133 | 134 | cap_set_sorted_indices = np.argsort(-np.array(cap_set_counter)) 135 | new_cap_set_list = [] 136 | new_cap_set_counter = [] 137 | for index in cap_set_sorted_indices: 138 | new_cap_set_list.append(cap_set_list[index]) 139 | new_cap_set_counter.append(cap_set_counter[index]) 140 | 141 | cap_set_list = new_cap_set_list 142 | cap_set_counter = new_cap_set_counter 143 | cap_set_names = ["_".join(list(cap_set)) for cap_set in cap_set_list] 144 | 145 | columns2 = cap_set_names 146 | columns2.append("total") 147 | columns2.append("std") 148 | columns2.append("runs") 149 | df2 = pd.DataFrame(columns=columns2) 150 | return ( 151 | sub_set, 152 | sub_set_name, 153 | data, 154 | counter, 155 | cap_set_list, 156 | cap_set_counter, 157 | len_data, 158 | df, 159 | df2, 160 | cap_set_names, 161 | ) 162 | 163 | 164 | def runs( 165 | args, 166 | grade_file, 167 | data, 168 | len_data, 169 | sub_set=None, 170 | ): 171 | with open(args.result_file) as f: 172 | results = json.load(f) 173 | if os.path.exists(grade_file): 174 | with open(grade_file, "r") as f: 175 | grade_results = json.load(f) 176 | else: 177 | grade_results = {} 178 | 179 | def need_more_runs(args, grade_results, len_data): 180 | need_more_runs = False 181 | if len(grade_results) > 0: 182 | for k, v in grade_results.items(): 183 | if len(v["score"]) < args.num_run: 184 | need_more_runs = True 185 | break 186 | return need_more_runs or len(grade_results) < len_data 187 | 188 | while need_more_runs(args, grade_results, len_data): 189 | for j in range(args.num_run): 190 | print(f"eval run {j}") 191 | for id, line in tqdm(data.items()): 192 | if sub_set is not None and id not in sub_set: 193 | continue 194 | if id in grade_results and len(grade_results[id]["score"]) >= (j + 1): 195 | continue 196 | 197 | model_pred = results[id] 198 | 199 | question = ( 200 | args.prompt 201 | + "\n" 202 | + " | ".join( 203 | [ 204 | line["question"], 205 | line["answer"] 206 | .replace("", " ") 207 | .replace("", " "), 208 | model_pred, 209 | "", 210 | ] 211 | ) 212 | ) 213 | messages = [ 214 | {"role": "user", "content": question}, 215 | ] 216 | 217 | if id not in grade_results: 218 | sample_grade = {"model": [], "content": [], "score": []} 219 | else: 220 | sample_grade = grade_results[id] 221 | 222 | grade_sample_run_complete = False 223 | temperature = 0.0 224 | 225 | while not grade_sample_run_complete: 226 | try: 227 | response = client.chat.completions.create( 228 | model=args.gpt_model, 229 | max_tokens=3, 230 | temperature=temperature, 231 | messages=messages, 232 | ) 233 | content = response.choices[0].message.content 234 | flag = True 235 | try_time = 1 236 | while flag: 237 | try: 238 | content = content.split(" ")[0].strip() 239 | score = float(content) 240 | if score > 1.0 or score < 0.0: 241 | assert False 242 | flag = False 243 | except: 244 | question = ( 245 | args.prompt 246 | + "\n" 247 | + " | ".join( 248 | [ 249 | line["question"], 250 | line["answer"] 251 | .replace("", " ") 252 | .replace("", " "), 253 | model_pred, 254 | "", 255 | ] 256 | ) 257 | + "\nPredict the correctness of the answer (digit): " 258 | ) 259 | messages = [ 260 | {"role": 
"user", "content": question}, 261 | ] 262 | response = client.chat.completions.create( 263 | model=args.gpt_model, 264 | max_tokens=3, 265 | temperature=temperature, 266 | messages=messages, 267 | ) 268 | content = response.choices[0].message.content 269 | try_time += 1 270 | temperature += 0.5 271 | print(f"{id} try {try_time} times") 272 | print(content) 273 | if try_time > 5: 274 | score = 0.0 275 | flag = False 276 | grade_sample_run_complete = True 277 | except RateLimitError as e: 278 | # gpt4 may have token rate limit 279 | print("sleep 30s") 280 | time.sleep(30) 281 | 282 | if len(sample_grade["model"]) >= j + 1: 283 | sample_grade["model"][j] = response.model 284 | sample_grade["content"][j] = content 285 | sample_grade["score"][j] = score 286 | else: 287 | sample_grade["model"].append(response.model) 288 | sample_grade["content"].append(content) 289 | sample_grade["score"].append(score) 290 | grade_results[id] = sample_grade 291 | 292 | with open(grade_file, "w") as f: 293 | json.dump(grade_results, f, indent=4) 294 | 295 | return grade_results 296 | 297 | 298 | def export_result(args, model, df, df2, grade_results, data, cap_set_counter, cap_set_names): 299 | columns = df.columns 300 | columns2 = df2.columns 301 | 302 | cap_socres = {k: [0.0] * args.num_run for k in columns[:-2]} 303 | counter["total"] = len_data 304 | 305 | cap_socres2 = {k: [0.0] * args.num_run for k in columns2[:-2]} 306 | counter2 = {columns2[i]: cap_set_counter[i] for i in range(len(cap_set_counter))} 307 | counter2["total"] = len_data 308 | 309 | for k, v in grade_results.items(): 310 | if sub_set is not None and k not in sub_set: 311 | continue 312 | for i in range(args.num_run): 313 | score = v["score"][i] 314 | caps = set(data[k]["capability"]) 315 | for c in caps: 316 | cap_socres[c][i] += score 317 | 318 | cap_socres["total"][i] += score 319 | 320 | index = cap_set_list.index(caps) 321 | cap_socres2[cap_set_names[index]][i] += score 322 | cap_socres2["total"][i] += score 323 | 324 | for k, v in cap_socres.items(): 325 | cap_socres[k] = np.array(v) / counter[k] * 100 326 | 327 | std = round(cap_socres["total"].std(), args.decimal_places) 328 | total_copy = cap_socres["total"].copy() 329 | runs = str(list(np.round(total_copy, args.decimal_places))) 330 | 331 | for k, v in cap_socres.items(): 332 | cap_socres[k] = round(v.mean(), args.decimal_places) 333 | 334 | cap_socres["std"] = std 335 | cap_socres["runs"] = runs 336 | df.loc[model] = cap_socres 337 | 338 | for k, v in cap_socres2.items(): 339 | cap_socres2[k] = round( 340 | np.mean(np.array(v) / counter2[k] * 100), args.decimal_places 341 | ) 342 | cap_socres2["std"] = std 343 | cap_socres2["runs"] = runs 344 | df2.loc[model] = cap_socres2 345 | 346 | df.to_csv(cap_score_file) 347 | df2.to_csv(cap_int_score_file) 348 | 349 | return df, df2 350 | 351 | 352 | if __name__ == "__main__": 353 | args = arg_parser() 354 | if args.openai_api_key: 355 | OPENAI_API_KEY = args.openai_api_key 356 | else: 357 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 358 | client = OpenAI( 359 | api_key=OPENAI_API_KEY 360 | ) 361 | 362 | if os.path.exists(args.result_file) is False: 363 | raise ValueError("Result file does not exist") 364 | if not args.result_file.endswith(('.json', '.JSON')): 365 | raise ValueError("Result file should be a json file") 366 | model = pathlib.Path(args.result_file).stem 367 | 368 | metadata = load_metadata(args) 369 | ( 370 | sub_set, 371 | sub_set_name, 372 | data, 373 | counter, 374 | cap_set_list, 375 | cap_set_counter, 376 | len_data, 377 
| df, 378 | df2, 379 | cap_set_names, 380 | ) = metadata 381 | file_names = get_file_names(args, model, sub_set_name) 382 | ( 383 | grade_file, 384 | cap_score_file, 385 | cap_int_score_file, 386 | ) = file_names 387 | grade_results = runs( 388 | args, 389 | grade_file, 390 | data, 391 | len_data, 392 | sub_set, 393 | ) 394 | df, df2 = export_result( 395 | args, 396 | model, 397 | df, 398 | df2, 399 | grade_results, 400 | data, 401 | cap_set_counter, 402 | cap_set_names, 403 | ) 404 | print(df) 405 | print("\n") 406 | print(df2) 407 | print("\n") 408 | print(f"Grading results are saved in:\n{grade_file}\n{cap_score_file}\n{cap_int_score_file}") -------------------------------------------------------------------------------- /v2/README.md: -------------------------------------------------------------------------------- 1 |

2 |
3 |

4 | 5 | 6 | # [MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models for Integrated Capabilities](https://arxiv.org/abs/2408.00765) 7 | 8 |

9 | [Paper] 10 | [Download Dataset] 11 | [Dataset on Hugging Face] 12 | [Leaderboard] 13 | [Online Evaluator] 14 |

15 | 16 | 17 | 18 | ![MM-Vet v2 examples](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/mm-vet-v2_examples.jpg) 19 | Figure 1: Four examples from MM-Vet v2. Compared with MM-Vet, MM-Vet v2 introduces more high-quality evaluation samples (e.g., (a) and (b)), and the ones with the new capability of image-text sequence understanding (e.g., (c) and (d)). 20 | 21 | The code is under the Apache 2.0 license, and the dataset is under the CC BY-NC 4.0 license. 22 | 23 | ## Evalute your model on MM-Vet v2 24 | **Step 0**: Install openai package with `pip install openai>=1` and get access GPT-4 API. If you have not access, you can try MM-Vet v2 online evaluator [Hugging Face Space](https://huggingface.co/spaces/whyu/MM-Vet-v2_Evaluator) (but it may wait for long time depending on number of users). 25 | 26 | **Step 1**: Download MM-Vet v2 data [here](https://github.com/yuweihao/MM-Vet/releases/download/v2/mm-vet-v2.zip) and unzip `unzip mm-vet-v2.zip`. 27 | 28 | **Step 2**: Infer your model on MM-Vet v2 and save your model outputs in json like [gpt-4o-2024-05-13_detail-high.json](results/gpt-4o-2024-05-13_detail-high.json), or just use [gpt-4o-2024-05-13_detail-high.json](results/gpt-4o-2024-05-13_detail-high.json) as example to evaluate. We also release inference scripts for GPT-4, Claude and Gemini. 29 | 30 | ```bash 31 | image_detail=high # or auto, low refer to https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding 32 | 33 | python inference/gpt4.py --mmvetv2_path /path/to/mm-vet-v2 --model_name gpt-4o-2024-05-13 --image_detail ${image_detail} 34 | ``` 35 | 36 | ```bash 37 | python inference/claude.py --mmvetv2_path /path/to/mm-vet-v2 --model_name claude-3-5-sonnet-20240620 38 | ``` 39 | 40 | ```bash 41 | python inference/gemini.py --mmvetv2_path /path/to/mm-vet-v2 --model_name gemini-1.5-pro 42 | ``` 43 | 44 | **Step 3**: `git clone https://github.com/yuweihao/MM-Vet.git && cd MM-Vet/v2`, run LLM-based evaluator 45 | ```bash 46 | python mm-vet-v2_evaluator.py --mmvetv2_path /path/to/mm-vet-v2 --result_file results/gpt-4o-2024-05-13_detail-high.json 47 | ``` 48 | If you cannot access GPT-4 (gpt-4-0613), you can upload your model output results (json file) to MM-Vet v2 online evaluator [Hugging Face Space](https://huggingface.co/spaces/whyu/MM-Vet-v2_Evaluator) to get the grading results. 49 | 50 | ## Some results 51 | ![MM-Vet v2 results](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/MM-Vet-v2_results.png) 52 | 53 | 54 | ## Some interesting samples 55 | 56 | ![MM-Vet v2 sample](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/v2_511_0.jpg) 57 | 58 | **Q**: As shown in the image, two iron balls are hanging on the Leaning Tower of Pisa, ball A weighs 20kg, and ball B weighs 5kg. If the ropes hanging them are cut at the same time and air resistance is ignored, which iron ball will land first? 59 | 60 | **GT**: A 61 | 62 | **Required capabilities**: Recognition, OCR, spatial awareness, knowledge 63 | 64 | --- 65 | 66 | ![MM-Vet v2 sample](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/v2_500_0.jpg) 67 | 68 | **Q**: How many feet do these animals have in total? 69 | 70 | **GT**: 10 71 | 72 | **Required capabilities**: Recognition, knowledge, math 73 | 74 | --- 75 | 76 | ![MM-Vet v2 sample](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/v2_495_0.jpg) 77 | 78 | **Q**: How many feet do these animals have in total? 
79 | 80 | **GT**: 16 81 | 82 | **Required capabilities**: Recognition, knowledge, math 83 | 84 | --- 85 | 86 | ![MM-Vet v2 sample](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/v2_504_0.jpg) 87 | 88 | **Q**: Is it possible for the car to move with magnetic force according to the Physical laws? 89 | 90 | **GT**: yes 91 | 92 | **Required capabilities**: Recognition, OCR, spatial awareness, knowledge 93 | 94 | --- 95 | 96 | ![MM-Vet v2 sample](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/v2_505_0.jpg) 97 | 98 | **Q**: Which track should the trolley go on, A or B? 99 | 100 | **GT**: A 101 | 102 | **Required capabilities**: Recognition, spatial awareness 103 | 104 | --- 105 | 106 | ![MM-Vet v2 sample](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/v2_507_0.jpg) 107 | 108 | **Q**: Can we make sure the cat is alive before we open the box? 109 | 110 | **GT**: yes 111 | 112 | **Required capabilities**: Recognition, spatial awareness, knowledge 113 | 114 | --- 115 | 116 | ![MM-Vet v2 sample](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/v2_508_0.jpg) 117 | 118 | **Q**: From location A to location B, is it faster to go east or west? 119 | 120 | **GT**: east 121 | 122 | **Required capabilities**: Recognition, spatial awareness, knowledge 123 | 124 | --- 125 | 126 | ![MM-Vet v2 sample](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/v2_512_0.jpg) 127 | 128 | **Q**: Neglecting air buoyancy (vacuum), which side will go down, iron or cotton? 129 | 130 | **GT**: iron 131 | 132 | **Required capabilities**: Recognition, OCR, spatial awareness, knowledge 133 | 134 | --- 135 | 136 | ![MM-Vet v2 sample](https://raw.githubusercontent.com/yuweihao/misc/master/MM-Vet/v2_514_0.jpg) 137 | 138 | **Q**: How many dwarfs are there near Snow White in the image? 
139 | 140 | **GT**: 6 141 | 142 | **Required capabilities**: Recognition, spatial awareness 143 | 144 | 145 | ## Citation 146 | ``` 147 | @article{yu2024mmvetv2, 148 | title={MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models for Integrated Capabilities}, 149 | author={Weihao Yu and Zhengyuan Yang and Lingfeng Ren and Linjie Li and Jianfeng Wang and Kevin Lin and Chung-Ching Lin and Zicheng Liu and Lijuan Wang and Xinchao Wang}, 150 | journal={arXiv preprint arXiv:2408.00765}, 151 | year={2024} 152 | } 153 | ``` -------------------------------------------------------------------------------- /v2/inference/claude.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import os 4 | import base64 5 | import requests 6 | import argparse 7 | import anthropic 8 | 9 | 10 | # Function to encode the image 11 | def encode_image(image_path): 12 | with open(image_path, "rb") as image_file: 13 | return base64.b64encode(image_file.read()).decode('utf-8') 14 | 15 | 16 | class Claude: 17 | def __init__(self, api_key, 18 | model="claude-3-5-sonnet-20240620", temperature=0.0, 19 | max_tokens=512, system=None): 20 | self.model = model 21 | self.client = anthropic.Anthropic( 22 | api_key=api_key, 23 | ) 24 | self.system = system 25 | self.temperature = temperature 26 | self.max_tokens = max_tokens 27 | 28 | def get_response(self, image_folder, prompt="What's in this image?"): 29 | messages = [] 30 | content = [] 31 | queries = prompt.split("") 32 | img_num = 0 33 | for query in queries: 34 | query = query.strip() 35 | if query == "": 36 | continue 37 | if query.endswith((".jpg", ".png", ".jpeg")): 38 | image_path = os.path.join(image_folder, query) 39 | base64_image = encode_image(image_path) 40 | image_format = "png" if image_path.endswith('.png') else "jpeg" 41 | content.append( 42 | { 43 | "type": "image", 44 | "source": { 45 | "type": "base64", 46 | "media_type": f"image/{image_format}", 47 | "data": base64_image, 48 | } 49 | } 50 | ) 51 | img_num += 1 52 | else: 53 | content.append( 54 | { 55 | "type": "text", 56 | "text": query 57 | }, 58 | ) 59 | 60 | messages.append({ 61 | "role": "user", 62 | "content": content, 63 | }) 64 | 65 | payload = { 66 | "model": self.model, 67 | "messages": messages, 68 | "max_tokens": self.max_tokens, 69 | "temperature": self.temperature, 70 | } 71 | 72 | if self.system: 73 | payload["system"] = self.system 74 | 75 | response = self.client.messages.create(**payload) 76 | response_text = response.content[0].text 77 | return response_text 78 | 79 | 80 | def arg_parser(): 81 | parser = argparse.ArgumentParser() 82 | parser.add_argument( 83 | "--mmvetv2_path", 84 | type=str, 85 | default="/path/to/mm-vet-v2", 86 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 87 | ) 88 | parser.add_argument( 89 | "--result_path", 90 | type=str, 91 | default="results", 92 | ) 93 | parser.add_argument( 94 | "--anthropic_api_key", type=str, default=None, 95 | help="refer to https://platform.openai.com/docs/quickstart?context=python" 96 | ) 97 | parser.add_argument( 98 | "--model_name", 99 | type=str, 100 | default="claude-3-5-sonnet-20240620", 101 | help="Claude model name", 102 | ) 103 | args = parser.parse_args() 104 | return args 105 | 106 | 107 | if __name__ == "__main__": 108 | args = arg_parser() 109 | model_name = args.model_name 110 | if os.path.exists(args.result_path) is False: 111 | os.makedirs(args.result_path) 112 | results_path = os.path.join(args.result_path, 
f"{model_name}.json") 113 | image_folder = os.path.join(args.mmvetv2_path, "images") 114 | meta_data = os.path.join(args.mmvetv2_path, "mm-vet-v2.json") 115 | 116 | 117 | if args.anthropic_api_key: 118 | ANTHROPIC_API_KEY = args.anthropic_api_key 119 | else: 120 | ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY') 121 | 122 | if ANTHROPIC_API_KEY is None: 123 | raise ValueError("Please set the ANTHROPIC_API_KEY environment variable or pass it as an argument") 124 | 125 | claude = Claude(ANTHROPIC_API_KEY, model=model_name) 126 | 127 | if os.path.exists(results_path): 128 | with open(results_path, "r") as f: 129 | results = json.load(f) 130 | else: 131 | results = {} 132 | 133 | with open(meta_data, "r") as f: 134 | data = json.load(f) 135 | 136 | for id in data: 137 | if id in results: 138 | continue 139 | prompt = data[id]["question"].strip() 140 | print(id) 141 | print(f"Prompt: {prompt}") 142 | response = claude.get_response(image_folder, prompt) 143 | print(f"Response: {response}") 144 | results[id] = response 145 | with open(results_path, "w") as f: 146 | json.dump(results, f, indent=4) 147 | -------------------------------------------------------------------------------- /v2/inference/cogagent.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a demo for using CogAgent and CogVLM in CLI 3 | Make sure you have installed vicuna-7b-v1.5 tokenizer model (https://huggingface.co/lmsys/vicuna-7b-v1.5), full checkpoint of vicuna-7b-v1.5 LLM is not required. 4 | In this demo, We us chat template, you can use others to replace such as 'vqa'. 5 | Strongly suggest to use GPU with bfloat16 support, otherwise, it will be slow. 6 | Mention that only one picture can be processed at one conversation, which means you can not replace or insert another picture during the conversation. 
7 | """ 8 | 9 | import argparse 10 | import torch 11 | import json 12 | import os 13 | from PIL import Image 14 | from transformers import AutoModelForCausalLM, LlamaTokenizer 15 | import pandas as pd 16 | from accelerate import ( 17 | init_empty_weights, 18 | infer_auto_device_map, 19 | load_checkpoint_and_dispatch, 20 | ) 21 | from utils import evaluate_on_mmvetv2, process_images_for_question 22 | 23 | 24 | class CogAgent: 25 | def __init__( 26 | self, 27 | model_name="THUDM/cogagent-chat-hf", 28 | tokenizer_name="", 29 | image_first=False, 30 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.", 31 | chat_format=True, 32 | ): 33 | self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu" 34 | 35 | self.tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name) 36 | if args.bf16: 37 | self.torch_type = torch.bfloat16 38 | else: 39 | self.torch_type = torch.float16 40 | 41 | print( 42 | "========Use torch type as:{} with device:{}========\n\n".format( 43 | self.torch_type, self.DEVICE 44 | ) 45 | ) 46 | # tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5') 47 | with init_empty_weights(): 48 | model = AutoModelForCausalLM.from_pretrained( 49 | model_name, 50 | torch_dtype=self.torch_type, 51 | low_cpu_mem_usage=True, 52 | trust_remote_code=True, 53 | ) 54 | device_map = infer_auto_device_map( 55 | model, 56 | max_memory={0: "20GiB", 1: "20GiB"}, 57 | no_split_module_classes=["CogAgentDecoderLayer"], 58 | ) 59 | path = "~/.cache/huggingface/hub/models--THUDM--cogagent-chat-hf/snapshots/balabala" # typical, '~/.cache/huggingface/hub/models--THUDM--cogagent-chat-hf/snapshots/balabala' 60 | model = load_checkpoint_and_dispatch( 61 | model, 62 | path, 63 | device_map=device_map, 64 | ) 65 | self.model = model.eval() 66 | self.system_message = system_message 67 | self.chat_format = chat_format 68 | 69 | def get_response(self, image_folder, prompt="What's in this image?") -> str: 70 | images = [] 71 | text_queries = [] 72 | queries = prompt.split("") 73 | for query in queries: 74 | query = query.strip() 75 | if query.endswith((".jpg", ".png", ".jpeg")): 76 | images.append(os.path.join(image_folder, query)) 77 | text_queries.append("") 78 | else: 79 | text_queries.append(query) 80 | text_query = "".join(text_queries) 81 | image = process_images_for_question(images).convert("RGB") 82 | input_by_model = self.model.build_conversation_input_ids( 83 | self.tokenizer, query=text_query, history=None, images=[image] 84 | ) 85 | inputs = { 86 | "input_ids": input_by_model["input_ids"].unsqueeze(0).to(self.DEVICE), 87 | "token_type_ids": input_by_model["token_type_ids"] 88 | .unsqueeze(0) 89 | .to(self.DEVICE), 90 | "attention_mask": input_by_model["attention_mask"] 91 | .unsqueeze(0) 92 | .to(self.DEVICE), 93 | "images": [ 94 | [input_by_model["images"][0].to(self.DEVICE).to(self.torch_type)] 95 | ], 96 | } 97 | if "cross_images" in input_by_model and input_by_model["cross_images"]: 98 | inputs["cross_images"] = [ 99 | [input_by_model["cross_images"][0].to(self.DEVICE).to(self.torch_type)] 100 | ] 101 | 102 | # add any transformers params here. 
103 | gen_kwargs = {"max_length": 2048, "temperature": 0.9, "do_sample": False} 104 | with torch.no_grad(): 105 | outputs = self.model.generate(**inputs, **gen_kwargs) 106 | outputs = outputs[:, inputs["input_ids"].shape[1] :] 107 | response = self.tokenizer.decode(outputs[0]) 108 | response = response.split("")[0] 109 | output_text = response 110 | return output_text 111 | 112 | 113 | def arg_parser(): 114 | parser = argparse.ArgumentParser() 115 | parser.add_argument( 116 | "--quant", choices=[4], type=int, default=None, help="quantization bits" 117 | ) 118 | parser.add_argument( 119 | "--model_name", 120 | type=str, 121 | default="THUDM/cogagent-chat-hf", 122 | help="pretrained ckpt", 123 | ) 124 | parser.add_argument( 125 | "--local_tokenizer", 126 | type=str, 127 | default="lmsys/vicuna-7b-v1.5", 128 | help="tokenizer path", 129 | ) 130 | parser.add_argument("--fp16", action="store_true") 131 | parser.add_argument("--bf16", action="store_true") 132 | parser.add_argument( 133 | "--mmvetv2_path", 134 | type=str, 135 | default="/path/to/mm-vet-v2", 136 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 137 | ) 138 | parser.add_argument( 139 | "--result_path", 140 | type=str, 141 | default="results", 142 | ) 143 | parser.add_argument( 144 | "--image_first", 145 | action="store_true", 146 | help="whether text", 147 | ) 148 | parser.add_argument( 149 | "--chat_format", 150 | action="store_true", 151 | help="whether to use chat format", 152 | ) 153 | args = parser.parse_args() 154 | return args 155 | 156 | 157 | # path = "/home/abc/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/e29dc3ba206d524bf8efbfc60d80fc4556ab0e3c" 158 | if __name__ == "__main__": 159 | args = arg_parser() 160 | 161 | model = CogAgent( 162 | args.model_name, args.local_tokenizer, image_first=args.image_first 163 | ) 164 | if args.image_first: 165 | args.model_name = args.model_name + "-image-first" 166 | if args.chat_format: 167 | args.model_name = args.model_name + "-chat-format" 168 | print(args) 169 | evaluate_on_mmvetv2(args, model) 170 | -------------------------------------------------------------------------------- /v2/inference/cogvlm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a demo for using CogAgent and CogVLM in CLI 3 | Make sure you have installed vicuna-7b-v1.5 tokenizer model (https://huggingface.co/lmsys/vicuna-7b-v1.5), full checkpoint of vicuna-7b-v1.5 LLM is not required. 4 | In this demo, We us chat template, you can use others to replace such as 'vqa'. 5 | Strongly suggest to use GPU with bfloat16 support, otherwise, it will be slow. 6 | Mention that only one picture can be processed at one conversation, which means you can not replace or insert another picture during the conversation. 
7 | """ 8 | 9 | import argparse 10 | import torch 11 | import json 12 | import os 13 | from PIL import Image 14 | from transformers import AutoModelForCausalLM, LlamaTokenizer 15 | import pandas as pd 16 | 17 | from accelerate import ( 18 | init_empty_weights, 19 | infer_auto_device_map, 20 | load_checkpoint_and_dispatch, 21 | ) 22 | from utils import evaluate_on_mmvetv2, process_images_for_question 23 | 24 | 25 | class CogVLM: 26 | def __init__( 27 | self, 28 | model_name="THUDM/cogvlm-chat-hf", 29 | tokenizer_name="", 30 | image_first=False, 31 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.", 32 | chat_format=True, 33 | ): 34 | self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu" 35 | 36 | self.tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name) 37 | if args.bf16: 38 | self.torch_type = torch.bfloat16 39 | else: 40 | self.torch_type = torch.float16 41 | 42 | print( 43 | "========Use torch type as:{} with device:{}========\n\n".format( 44 | self.torch_type, self.DEVICE 45 | ) 46 | ) 47 | 48 | with init_empty_weights(): 49 | model = AutoModelForCausalLM.from_pretrained( 50 | model_name, 51 | torch_dtype=self.torch_type, 52 | low_cpu_mem_usage=True, 53 | trust_remote_code=True, 54 | ) 55 | device_map = infer_auto_device_map( 56 | model, 57 | max_memory={0: "20GiB", 1: "20GiB"}, 58 | no_split_module_classes=["CogVLMDecoderLayer", "TransformerLayer"], 59 | ) 60 | path = ( 61 | "~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala" 62 | ) 63 | 64 | model = load_checkpoint_and_dispatch( 65 | model, 66 | path, # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala' 67 | device_map=device_map, 68 | ) 69 | self.model = model.eval() 70 | self.system_message = system_message 71 | self.chat_format = chat_format 72 | 73 | def get_response(self, image_folder, prompt="What's in this image?") -> str: 74 | images = [] 75 | text_queries = [] 76 | queries = prompt.split("") 77 | for query in queries: 78 | query = query.strip() 79 | if query.endswith((".jpg", ".png", ".jpeg")): 80 | images.append(os.path.join(image_folder, query)) 81 | text_queries.append("") 82 | else: 83 | text_queries.append(query) 84 | text_query = "".join(text_queries) 85 | image = process_images_for_question(images).convert("RGB") 86 | input_by_model = self.model.build_conversation_input_ids( 87 | self.tokenizer, query=text_query, history=None, images=[image] 88 | ) 89 | inputs = { 90 | "input_ids": input_by_model["input_ids"].unsqueeze(0).to(self.DEVICE), 91 | "token_type_ids": input_by_model["token_type_ids"] 92 | .unsqueeze(0) 93 | .to(self.DEVICE), 94 | "attention_mask": input_by_model["attention_mask"] 95 | .unsqueeze(0) 96 | .to(self.DEVICE), 97 | "images": ( 98 | [[input_by_model["images"][0].to(self.DEVICE).to(self.torch_type)]] 99 | if image is not None 100 | else None 101 | ), 102 | } 103 | if "cross_images" in input_by_model and input_by_model["cross_images"]: 104 | inputs["cross_images"] = [ 105 | [input_by_model["cross_images"][0].to(self.DEVICE).to(self.torch_type)] 106 | ] 107 | 108 | # add any transformers params here. 
109 | gen_kwargs = {"max_length": 2048, "do_sample": False} # "temperature": 0.9 110 | with torch.no_grad(): 111 | outputs = self.model.generate(**inputs, **gen_kwargs) 112 | outputs = outputs[:, inputs["input_ids"].shape[1] :] 113 | response = self.tokenizer.decode(outputs[0]) 114 | response = response.split("")[0].strip() 115 | output_text = response 116 | return output_text 117 | 118 | 119 | def arg_parser(): 120 | parser = argparse.ArgumentParser() 121 | parser.add_argument( 122 | "--quant", choices=[4], type=int, default=None, help="quantization bits" 123 | ) 124 | parser.add_argument( 125 | "--model_name", 126 | type=str, 127 | default="THUDM/cogvlm-chat-hf", 128 | help="pretrained ckpt", 129 | ) 130 | parser.add_argument( 131 | "--local_tokenizer", 132 | type=str, 133 | default="lmsys/vicuna-7b-v1.5", 134 | help="tokenizer path", 135 | ) 136 | parser.add_argument("--fp16", action="store_true") 137 | parser.add_argument("--bf16", action="store_true") 138 | parser.add_argument( 139 | "--mmvetv2_path", 140 | type=str, 141 | default="/path/to/mm-vet-v2", 142 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 143 | ) 144 | parser.add_argument( 145 | "--result_path", 146 | type=str, 147 | default="results", 148 | ) 149 | parser.add_argument( 150 | "--image_first", 151 | action="store_true", 152 | help="whether text", 153 | ) 154 | parser.add_argument( 155 | "--chat_format", 156 | action="store_true", 157 | help="whether to use chat format", 158 | ) 159 | args = parser.parse_args() 160 | return args 161 | 162 | 163 | if __name__ == "__main__": 164 | args = arg_parser() 165 | 166 | model = CogVLM(args.model_name, args.local_tokenizer, image_first=args.image_first) 167 | if args.image_first: 168 | args.model_name = args.model_name + "-image-first" 169 | if args.chat_format: 170 | args.model_name = args.model_name + "-chat-format" 171 | print(args) 172 | evaluate_on_mmvetv2(args, model) 173 | -------------------------------------------------------------------------------- /v2/inference/emu2.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from accelerate import ( 5 | init_empty_weights, 6 | infer_auto_device_map, 7 | load_checkpoint_and_dispatch, 8 | ) 9 | import os 10 | import argparse 11 | from utils import evaluate_on_mmvetv2 12 | 13 | 14 | class Emu2: 15 | def __init__( 16 | self, 17 | model_name="BAAI/Emu2-Chat", 18 | image_first=False, 19 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.", 20 | chat_format=True, 21 | ): 22 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) # "BAAI/Emu2-Chat" 23 | # self.model = AutoModelForCausalLM.from_pretrained( 24 | # model_name, 25 | # torch_dtype=torch.bfloat16, 26 | # low_cpu_mem_usage=True, 27 | # trust_remote_code=True).to('cuda').eval() 28 | with init_empty_weights(): 29 | model = AutoModelForCausalLM.from_pretrained( 30 | model_name, 31 | torch_dtype=torch.bfloat16, 32 | low_cpu_mem_usage=True, 33 | trust_remote_code=True, 34 | ) 35 | device_map = infer_auto_device_map( 36 | model, 37 | max_memory={0: "16GIB", 1: "20GIB", 2: "20GIB", 3: "20GIB"}, 38 | no_split_module_classes=["Block", "LlamaDecoderLayer"], 39 | ) 40 | device_map["model.decoder.lm.lm_head"] = 0 41 | self.image_first = image_first 42 | self.model = load_checkpoint_and_dispatch( 43 | model, 44 | 
"/home/abc/.cache/huggingface/hub/models--BAAI--Emu2-Chat/snapshots/20ea30b04f8fee599cf97535e655c200df728501", 45 | device_map=device_map, 46 | ).eval() 47 | self.system_message = system_message 48 | self.chat_format = chat_format 49 | 50 | def get_response(self, image_folder, prompt="What's in this image?") -> str: 51 | images = [] 52 | text_queries = [] 53 | queries = prompt.split("") 54 | for query in queries: 55 | query = query.strip() 56 | if query.endswith((".jpg", ".png", ".jpeg")): 57 | image_path = os.path.join(image_folder, query) 58 | images.append(Image.open(image_path).convert("RGB")) 59 | text_queries.append("[]") 60 | else: 61 | text_queries.append(query) 62 | 63 | if self.image_first: 64 | for i in range(1, len(text_queries)): 65 | if text_queries[i] == "[]" and ( 66 | text_queries[i - 1] != "[]" 67 | ): 68 | tmp = text_queries[i - 1] 69 | text_queries[i - 1] = text_queries[i] 70 | text_queries[i] = tmp 71 | text_query = "".join(text_queries) 72 | if self.chat_format: 73 | text_query = f"{self.system_message} [USER]: {text_query} [ASSISTANT]:" 74 | print(text_query) 75 | inputs = self.model.build_input_ids( 76 | text=[text_query], tokenizer=self.tokenizer, image=images 77 | ) 78 | 79 | with torch.no_grad(): 80 | outputs = self.model.generate( 81 | input_ids=inputs["input_ids"], 82 | attention_mask=inputs["attention_mask"], 83 | image=inputs["image"].to(torch.bfloat16), 84 | max_new_tokens=512, 85 | length_penalty=-1, 86 | ) 87 | output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] 88 | return output_text 89 | 90 | 91 | def arg_parser(): 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument( 94 | "--mmvetv2_path", 95 | type=str, 96 | default="/path/to/mm-vet-v2", 97 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 98 | ) 99 | parser.add_argument( 100 | "--result_path", 101 | type=str, 102 | default="results", 103 | ) 104 | parser.add_argument( 105 | "--model_name", 106 | type=str, 107 | default="BAAI/Emu2-Chat", 108 | help="pretrained ckpt", 109 | ) 110 | parser.add_argument( 111 | "--image_first", 112 | action="store_true", 113 | help="whether text", 114 | ) 115 | parser.add_argument( 116 | "--chat_format", 117 | action="store_true", 118 | help="whether to use chat format", 119 | ) 120 | args = parser.parse_args() 121 | return args 122 | 123 | 124 | if __name__ == "__main__": 125 | args = arg_parser() 126 | 127 | model = Emu2(args.model_name, image_first=args.image_first) 128 | if args.image_first: 129 | args.model_name = args.model_name + "-image-first" 130 | if args.chat_format: 131 | args.model_name = args.model_name + "-chat-format" 132 | evaluate_on_mmvetv2(args, model) 133 | -------------------------------------------------------------------------------- /v2/inference/gemini.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from pathlib import Path 4 | import argparse 5 | import json 6 | import google.generativeai as genai 7 | from utils import evaluate_on_mmvetv2 8 | 9 | 10 | class Gemini: 11 | def __init__(self, model="gemini-1.5-pro"): 12 | self.model = genai.GenerativeModel(model) 13 | 14 | def get_response(self, image_folder, prompt="What's in this image?") -> str: 15 | 16 | content = [] 17 | queries = prompt.split("") 18 | img_num = 0 19 | for query in queries: 20 | if query.endswith((".jpg", ".png", ".jpeg")): 21 | image_path = Path(os.path.join(image_folder, query)) 22 | image = { 23 | 'mime_type': 
f'image/{image_path.suffix[1:].replace("jpg", "jpeg")}', 24 | 'data': image_path.read_bytes() 25 | } 26 | img_num += 1 27 | content.append(image) 28 | else: 29 | content.append(query) 30 | 31 | if img_num > 16: 32 | return "" 33 | # Query the model 34 | text = "" 35 | while len(text) < 1: 36 | try: 37 | response = self.model.generate_content( 38 | content 39 | ) 40 | try: 41 | text = response.text 42 | except: 43 | text = " " 44 | except Exception as error: 45 | print(error) 46 | print('Sleeping for 10 seconds') 47 | time.sleep(10) 48 | return text.strip() 49 | 50 | 51 | def arg_parser(): 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument( 54 | "--mmvetv2_path", 55 | type=str, 56 | default="/path/to/mm-vet", 57 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 58 | ) 59 | parser.add_argument( 60 | "--result_path", 61 | type=str, 62 | default="results", 63 | ) 64 | parser.add_argument( 65 | "--google_api_key", type=str, default=None, 66 | help="refer to https://ai.google.dev/tutorials/python_quickstart" 67 | ) 68 | parser.add_argument( 69 | "--model_name", 70 | type=str, 71 | default="gemini-1.5-pro", 72 | help="Gemini model name", 73 | ) 74 | args = parser.parse_args() 75 | return args 76 | 77 | 78 | if __name__ == "__main__": 79 | args = arg_parser() 80 | 81 | if args.google_api_key: 82 | GOOGLE_API_KEY = args.google_api_key 83 | else: 84 | GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY') 85 | 86 | if GOOGLE_API_KEY is None: 87 | raise ValueError("Please set the GOOGLE_API_KEY environment variable or pass it as an argument") 88 | 89 | genai.configure(api_key=GOOGLE_API_KEY) 90 | model = Gemini(model=args.model_name) 91 | 92 | evaluate_on_mmvetv2(args, model) 93 | 94 | 95 | -------------------------------------------------------------------------------- /v2/inference/gpt4.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import requests 4 | import argparse 5 | from utils import encode_image, evaluate_on_mmvetv2 6 | 7 | 8 | 9 | 10 | class GPT4: 11 | def __init__(self, api_key, model="gpt-4o-2024-05-13", image_detail="auto", 12 | system_text="You are a helpful assistant. 
Generate a short and concise response to the following image text pair."): 13 | self.api_key = api_key 14 | self.model = model 15 | self.image_detail = image_detail 16 | self.system_text = system_text 17 | self.headers = { 18 | "Content-Type": "application/json", 19 | "Authorization": f"Bearer {self.api_key}" 20 | } 21 | self.url = "https://api.openai.com/v1/chat/completions" 22 | 23 | def get_response(self, image_folder, prompt="What's in this image?"): 24 | messages = [] 25 | if self.system_text is not None or self.system_text != "": 26 | messages.append({ 27 | "role": "system", 28 | "content": [ 29 | { 30 | "type": "text", 31 | "text": self.system_text, 32 | }, 33 | ] 34 | }) 35 | 36 | content = [] 37 | queries = prompt.split("") 38 | img_num = 0 39 | for query in queries: 40 | query = query.strip() 41 | if query.endswith((".jpg", ".png", ".jpeg")): 42 | image_path = os.path.join(image_folder, query) 43 | base64_image = encode_image(image_path) 44 | image_format = "data:image/png;base64" if image_path.endswith('.png') else "data:image/jpeg;base64" 45 | content.append( 46 | { 47 | "type": "image_url", 48 | "image_url": { 49 | "url": f"{image_format},{base64_image}", 50 | "detail": self.image_detail, 51 | } 52 | } 53 | ) 54 | img_num += 1 55 | else: 56 | content.append( 57 | { 58 | "type": "text", 59 | "text": query 60 | }, 61 | ) 62 | 63 | messages.append({ 64 | "role": "user", 65 | "content": content, 66 | }) 67 | payload = { 68 | "model": self.model, 69 | "messages": messages, 70 | "max_tokens": 500, 71 | } 72 | 73 | response_text, retry, response_json, regular_time = '', 0, None, 30 74 | while len(response_text) < 1: 75 | retry += 1 76 | time.sleep(1) 77 | try: 78 | response = requests.post(self.url, headers=self.headers, json=payload) 79 | response_json = response.json() 80 | # print(response_json) 81 | except Exception as e: 82 | print(e) 83 | time.sleep(regular_time) 84 | continue 85 | if response.status_code != 200: 86 | print(response.headers,response.content) 87 | print(image_path) 88 | print(f"The response status code for is {response.status_code} (Not OK)") 89 | time.sleep(regular_time) 90 | continue 91 | if 'choices' not in response_json: 92 | time.sleep(regular_time) 93 | continue 94 | response_text = response_json["choices"][0]["message"]["content"] 95 | return response_json["choices"][0]["message"]["content"] 96 | 97 | 98 | def arg_parser(): 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument( 101 | "--mmvetv2_path", 102 | type=str, 103 | default="/path/to/mm-vet-v2", 104 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 105 | ) 106 | parser.add_argument( 107 | "--result_path", 108 | type=str, 109 | default="results", 110 | ) 111 | parser.add_argument( 112 | "--openai_api_key", type=str, default=None, 113 | help="refer to https://platform.openai.com/docs/quickstart?context=python" 114 | ) 115 | parser.add_argument( 116 | "--model_name", 117 | type=str, 118 | default="gpt-4o-2024-05-13", 119 | help="GPT model name", 120 | ) 121 | parser.add_argument( 122 | "--image_detail", 123 | type=str, 124 | default="auto", 125 | help="Refer to https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding", 126 | ) 127 | args = parser.parse_args() 128 | return args 129 | 130 | 131 | if __name__ == "__main__": 132 | args = arg_parser() 133 | 134 | if args.openai_api_key: 135 | OPENAI_API_KEY = args.openai_api_key 136 | else: 137 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 138 | 139 | if OPENAI_API_KEY is None: 140 | 
raise ValueError("Please set the OPENAI_API_KEY environment variable or pass it as an argument") 141 | 142 | model = GPT4(OPENAI_API_KEY, model=args.model_name, image_detail=args.image_detail) 143 | args.model_name = f"{args.model_name}_detail-{args.image_detail}" 144 | 145 | evaluate_on_mmvetv2(args, model) 146 | -------------------------------------------------------------------------------- /v2/inference/internvl.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor 4 | import os 5 | from accelerate import init_empty_weights, infer_auto_device_map 6 | import argparse 7 | from utils import evaluate_on_mmvetv2, process_images_for_question 8 | 9 | 10 | def disable_torch_init(): 11 | """ 12 | Disable the redundant torch default initialization to accelerate model creation. 13 | """ 14 | import torch 15 | 16 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 17 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 18 | 19 | 20 | class Internvl: 21 | def __init__( 22 | self, 23 | model_name="OpenGVLab/InternVL-Chat-V1-2", # OpenGVLab/InternVL-Chat-V1-2 24 | image_first=False, 25 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.", 26 | chat_format=True, 27 | ): 28 | random.seed(args.seed) 29 | if args.bf16: 30 | self.torch_type = torch.bfloat16 31 | else: 32 | self.torch_type = torch.float16 33 | self.model = AutoModel.from_pretrained( 34 | model_name, 35 | torch_dtype=self.torch_type, 36 | low_cpu_mem_usage=True, 37 | trust_remote_code=True, 38 | device_map="auto", 39 | ).eval() 40 | 41 | self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 42 | self.temperature = 0.0 43 | self.system_message = system_message 44 | self.chat_format = chat_format 45 | 46 | def get_response(self, image_folder, prompt="What's in this image?") -> str: 47 | images = [] 48 | text_queries = [] 49 | queries = prompt.split("") 50 | for query in queries: 51 | query = query.strip() 52 | if query.endswith((".jpg", ".png", ".jpeg")): 53 | images.append(os.path.join(image_folder, query)) 54 | text_queries.append("") 55 | else: 56 | text_queries.append(query) 57 | text_query = "".join(text_queries) 58 | image = process_images_for_question(images).convert("RGB") 59 | image = image.resize((448, 448)) 60 | image_processor = CLIPImageProcessor.from_pretrained(self.model_name) 61 | 62 | pixel_values = image_processor(images=image, return_tensors="pt").pixel_values 63 | pixel_values = pixel_values.to(self.torch_type).cuda() 64 | 65 | generation_config = dict( 66 | num_beams=1, 67 | max_new_tokens=1024, 68 | do_sample=True if self.temperature > 0 else False, 69 | temperature=self.temperature, 70 | length_penalty=1.0, 71 | repetition_penalty=1.2, 72 | ) 73 | 74 | response = model.chat( 75 | self.tokenizer, pixel_values, text_query, generation_config 76 | ) 77 | return response 78 | 79 | 80 | def arg_parser(): 81 | parser = argparse.ArgumentParser() 82 | parser.add_argument( 83 | "--mmvetv2_path", 84 | type=str, 85 | default="/path/to/mm-vet-v2", 86 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 87 | ) 88 | parser.add_argument( 89 | "--result_path", 90 | type=str, 91 | default="results", 92 | ) 93 | parser.add_argument("--fp16", action="store_true") 94 | parser.add_argument("--bf16", action="store_true") 95 | parser.add_argument( 96 | "--model_name", 97 | 
type=str, 98 | default="OpenGVLab/InternVL-Chat-V1-2", 99 | help="pretrained ckpt", 100 | ) 101 | parser.add_argument( 102 | "--image_first", 103 | action="store_true", 104 | help="whether text", 105 | ) 106 | parser.add_argument( 107 | "--chat_format", 108 | action="store_true", 109 | help="whether to use chat format", 110 | ) 111 | parser.add_argument("--seed", type=int, default=0) 112 | args = parser.parse_args() 113 | return args 114 | 115 | 116 | if __name__ == "__main__": 117 | args = arg_parser() 118 | 119 | model = Internvl(args.model_name, image_first=args.image_first) 120 | # model = None 121 | if args.image_first: 122 | args.model_name = args.model_name + "-image-first" 123 | if args.chat_format: 124 | args.model_name = args.model_name + "-chat-format" 125 | evaluate_on_mmvetv2(args, model) 126 | -------------------------------------------------------------------------------- /v2/inference/internvl2.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import torch 4 | from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor 5 | import os 6 | from accelerate import init_empty_weights, infer_auto_device_map 7 | import argparse 8 | from utils import evaluate_on_mmvetv2, process_images_for_question 9 | import torchvision.transforms as T 10 | from PIL import Image, ImageDraw 11 | from torchvision.transforms.functional import InterpolationMode 12 | 13 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 14 | IMAGENET_STD = (0.229, 0.224, 0.225) 15 | 16 | 17 | def split_model(model_name): 18 | device_map = {} 19 | world_size = torch.cuda.device_count() 20 | num_layers = { 21 | "InternVL2-1B": 24, 22 | "InternVL2-2B": 24, 23 | "InternVL2-4B": 32, 24 | "InternVL2-8B": 32, 25 | "InternVL2-26B": 48, 26 | "InternVL2-40B": 60, 27 | "InternVL2-Llama3-76B": 80, 28 | }[model_name] 29 | # Since the first GPU will be used for ViT, treat it as half a GPU. 
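    # Worked example (for illustration only): InternVL2-40B has 60 decoder layers,
    # so on 4 GPUs the effective world size is 4 - 0.5 = 3.5, giving
    # ceil(60 / 3.5) = 18 layers per GPU; GPU 0 is then trimmed to
    # ceil(18 * 0.5) = 9 layers because it also hosts the vision tower,
    # embeddings, and output head (assigned explicitly below).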
30 | num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5)) 31 | num_layers_per_gpu = [num_layers_per_gpu] * world_size 32 | num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5) 33 | layer_cnt = 0 34 | for i, num_layer in enumerate(num_layers_per_gpu): 35 | for j in range(num_layer): 36 | device_map[f"language_model.model.layers.{layer_cnt}"] = i 37 | layer_cnt += 1 38 | device_map["vision_model"] = 0 39 | device_map["mlp1"] = 0 40 | device_map["language_model.model.tok_embeddings"] = 0 41 | device_map["language_model.model.embed_tokens"] = 0 42 | device_map["language_model.output"] = 0 43 | device_map["language_model.model.norm"] = 0 44 | device_map["language_model.lm_head"] = 0 45 | device_map[f"language_model.model.layers.{num_layers - 1}"] = 0 46 | 47 | return device_map 48 | 49 | 50 | def build_transform(input_size): 51 | MEAN, STD = IMAGENET_MEAN, IMAGENET_STD 52 | transform = T.Compose( 53 | [ 54 | T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), 55 | T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), 56 | T.ToTensor(), 57 | T.Normalize(mean=MEAN, std=STD), 58 | ] 59 | ) 60 | return transform 61 | 62 | 63 | def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): 64 | best_ratio_diff = float("inf") 65 | best_ratio = (1, 1) 66 | area = width * height 67 | for ratio in target_ratios: 68 | target_aspect_ratio = ratio[0] / ratio[1] 69 | ratio_diff = abs(aspect_ratio - target_aspect_ratio) 70 | if ratio_diff < best_ratio_diff: 71 | best_ratio_diff = ratio_diff 72 | best_ratio = ratio 73 | elif ratio_diff == best_ratio_diff: 74 | if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: 75 | best_ratio = ratio 76 | return best_ratio 77 | 78 | 79 | def dynamic_preprocess( 80 | image, min_num=1, max_num=6, image_size=448, use_thumbnail=False 81 | ): 82 | orig_width, orig_height = image.size 83 | aspect_ratio = orig_width / orig_height 84 | 85 | # calculate the existing image aspect ratio 86 | target_ratios = set( 87 | (i, j) 88 | for n in range(min_num, max_num + 1) 89 | for i in range(1, n + 1) 90 | for j in range(1, n + 1) 91 | if i * j <= max_num and i * j >= min_num 92 | ) 93 | target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) 94 | 95 | # find the closest aspect ratio to the target 96 | target_aspect_ratio = find_closest_aspect_ratio( 97 | aspect_ratio, target_ratios, orig_width, orig_height, image_size 98 | ) 99 | 100 | # calculate the target width and height 101 | target_width = image_size * target_aspect_ratio[0] 102 | target_height = image_size * target_aspect_ratio[1] 103 | blocks = target_aspect_ratio[0] * target_aspect_ratio[1] 104 | 105 | # resize the image 106 | resized_img = image.resize((target_width, target_height)) 107 | processed_images = [] 108 | for i in range(blocks): 109 | box = ( 110 | (i % (target_width // image_size)) * image_size, 111 | (i // (target_width // image_size)) * image_size, 112 | ((i % (target_width // image_size)) + 1) * image_size, 113 | ((i // (target_width // image_size)) + 1) * image_size, 114 | ) 115 | # split the image 116 | split_img = resized_img.crop(box) 117 | processed_images.append(split_img) 118 | assert len(processed_images) == blocks 119 | if use_thumbnail and len(processed_images) != 1: 120 | thumbnail_img = image.resize((image_size, image_size)) 121 | processed_images.append(thumbnail_img) 122 | return processed_images 123 | 124 | 125 | def load_image(image_file, input_size=448, max_num=6): 126 | if 
isinstance(image_file, str): 127 | image = Image.open(image_file).convert("RGB") 128 | else: 129 | image = image_file 130 | transform = build_transform(input_size=input_size) 131 | images = dynamic_preprocess( 132 | image, image_size=input_size, use_thumbnail=True, max_num=max_num 133 | ) 134 | pixel_values = [transform(image) for image in images] 135 | pixel_values = torch.stack(pixel_values) 136 | return pixel_values 137 | 138 | 139 | def disable_torch_init(): 140 | """ 141 | Disable the redundant torch default initialization to accelerate model creation. 142 | """ 143 | import torch 144 | 145 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 146 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 147 | 148 | 149 | class Internvl: 150 | def __init__( 151 | self, 152 | model_name="OpenGVLab/InternVL2-40B", # OpenGVLab/InternVL-Chat-V1-5 OpenGVLab/InternVL2-40B OpenGVLab/InternVL2-Llama3-76B 153 | image_first=False, 154 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.", 155 | chat_format=True, 156 | ): 157 | random.seed(args.seed) 158 | if args.bf16: 159 | self.torch_type = torch.bfloat16 160 | else: 161 | self.torch_type = torch.float16 162 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1" 163 | try: 164 | self.model = AutoModel.from_pretrained( 165 | pretrained_model_name_or_path=model_name, 166 | torch_dtype=self.torch_type, 167 | low_cpu_mem_usage=True, 168 | trust_remote_code=True, 169 | device_map="auto", 170 | ).eval() 171 | except Exception: 172 | device_map = split_model( 173 | model_name.split("/")[-1] 174 | ) # "InternVL2-Llama3-76B" 175 | self.model = AutoModel.from_pretrained( 176 | model_name, 177 | torch_dtype=self.torch_type, 178 | low_cpu_mem_usage=True, 179 | trust_remote_code=True, 180 | device_map=device_map, 181 | ).eval() 182 | self.tokenizer = AutoTokenizer.from_pretrained( 183 | model_name, use_fast=False, trust_remote_code=True 184 | ) 185 | 186 | def get_response(self, image_folder, prompt="What's in this image?") -> str: 187 | images = [] 188 | text_queries = [] 189 | queries = prompt.split("") 190 | pixel_values = [] 191 | num_patches_list = [] 192 | for query in queries: 193 | query = query.strip() 194 | if query.endswith((".jpg", ".png", ".jpeg")): 195 | images.append(os.path.join(image_folder, query)) 196 | pixel_values.append( 197 | load_image(os.path.join(image_folder, query), max_num=6) 198 | .to(self.torch_type) 199 | .cuda() 200 | ) 201 | num_patches_list.append(pixel_values[-1].size(0)) 202 | else: 203 | text_queries.append(query) 204 | text_query = "".join(text_queries) 205 | if args.unique: 206 | question = "" 207 | for i in range(len(pixel_values)): 208 | idx = i + 1 209 | question += f"Image-{idx}: \n" 210 | question += text_query 211 | else: 212 | question = f"\n{text_query}" 213 | pixel_values = torch.cat(pixel_values, dim=0) 214 | generation_config = dict( 215 | num_beams=1, 216 | max_new_tokens=512, 217 | min_new_tokens=1, 218 | do_sample=True if args.temperature > 0 else False, 219 | temperature=args.temperature, 220 | length_penalty=1.0, 221 | repetition_penalty=1.2, 222 | ) 223 | try: 224 | response, history = model.chat( 225 | self.tokenizer, 226 | pixel_values, 227 | question, 228 | generation_config, 229 | num_patches_list=num_patches_list if args.unique else None, 230 | history=None, 231 | return_history=True, 232 | ) 233 | except Exception as e: 234 | combined_images = process_images_for_question(images).convert("RGB") 235 | pixel_values = ( 236 | 
load_image(combined_images, max_num=6).to(self.torch_type).cuda() 237 | ) 238 | response, history = model.chat( 239 | self.tokenizer, 240 | pixel_values, 241 | question, 242 | generation_config, 243 | history=None, 244 | return_history=True, 245 | ) 246 | print(f"found error: {e}, combine images to save space") 247 | return response 248 | 249 | 250 | def arg_parser(): 251 | parser = argparse.ArgumentParser() 252 | parser.add_argument( 253 | "--mmvetv2_path", 254 | type=str, 255 | default="/path/to/mm-vet-v2", 256 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 257 | ) 258 | parser.add_argument( 259 | "--result_path", 260 | type=str, 261 | default="results", 262 | ) 263 | parser.add_argument("--fp16", action="store_true") 264 | parser.add_argument("--bf16", action="store_true") 265 | parser.add_argument( 266 | "--model_name", 267 | type=str, 268 | default="OpenGVLab/InternVL2-40B", 269 | help="pretrained ckpt", 270 | ) 271 | parser.add_argument( 272 | "--image_first", 273 | action="store_true", 274 | help="whether text", 275 | ) 276 | parser.add_argument( 277 | "--chat_format", 278 | action="store_true", 279 | help="whether to use chat format", 280 | ) 281 | parser.add_argument("--temperature", type=float, default=0.0) 282 | parser.add_argument("--unique", action="store_true") 283 | 284 | parser.add_argument("--seed", type=int, default=0) 285 | args = parser.parse_args() 286 | return args 287 | 288 | 289 | if __name__ == "__main__": 290 | args = arg_parser() 291 | 292 | model = Internvl(args.model_name, image_first=args.image_first) 293 | if args.image_first: 294 | args.model_name = args.model_name + "-image-first" 295 | if args.chat_format: 296 | args.model_name = args.model_name + "-chat-format" 297 | evaluate_on_mmvetv2(args, model) 298 | -------------------------------------------------------------------------------- /v2/inference/ixc2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModel, AutoTokenizer 3 | import argparse 4 | 5 | import os 6 | import torch 7 | import torchvision 8 | from PIL import Image 9 | from utils import evaluate_on_mmvetv2, process_images_for_question 10 | 11 | 12 | def auto_configure_device_map(num_gpus): 13 | # visual_encoder 算4层 14 | # internlm_model.model.embed_tokens 占用1层 15 | # norm 和 lm_head 占用1层 16 | # transformer.layers 占用 32 层 17 | # 总共34层分配到num_gpus张卡上 18 | num_trans_layers = 32 19 | per_gpu_layers = 38 / num_gpus 20 | 21 | device_map = { 22 | "vit": 0, 23 | "vision_proj": 0, 24 | "model.tok_embeddings": 0, 25 | "model.norm": num_gpus - 1, 26 | "output": num_gpus - 1, 27 | } 28 | 29 | used = 3 30 | gpu_target = 0 31 | for i in range(num_trans_layers): 32 | if used >= per_gpu_layers: 33 | gpu_target += 1 34 | used = 0 35 | assert gpu_target < num_gpus 36 | device_map[f"model.layers.{i}"] = gpu_target 37 | used += 1 38 | 39 | return device_map 40 | 41 | 42 | def model_gen_single_img(model, text, images, need_bos=True, padding=False): 43 | pt1 = 0 44 | embeds = [] 45 | im_mask = [] 46 | images = images 47 | images_loc = [0] 48 | for i, pts in enumerate(images_loc + [len(text)]): 49 | subtext = text[pt1:pts] 50 | if need_bos or len(subtext) > 0: 51 | text_embeds = model.encode_text(subtext, add_special_tokens=need_bos) 52 | embeds.append(text_embeds) 53 | im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda()) 54 | need_bos = False 55 | if i < len(images): 56 | try: 57 | image = Image.open(images[i]).convert("RGB") 58 | except: 59 | image 
= images[i].convert("RGB") 60 | if padding: 61 | image = __padding__(image) 62 | image = model.vis_processor(image).unsqueeze(0).half().cuda() 63 | image_embeds = model.encode_img(image) 64 | embeds.append(image_embeds) 65 | im_mask.append(torch.ones(image_embeds.shape[:2]).cuda()) 66 | pt1 = pts 67 | embeds = torch.cat(embeds, dim=1) 68 | im_mask = torch.cat(im_mask, dim=1) 69 | im_mask = im_mask.bool() 70 | 71 | outputs = model.generate( 72 | inputs_embeds=embeds, 73 | im_mask=im_mask, 74 | temperature=1.0, 75 | max_new_tokens=4096, 76 | num_beams=3, 77 | do_sample=False, 78 | repetition_penalty=1.0, 79 | ) 80 | 81 | output_token = outputs[0] 82 | if output_token[0] == 0 or output_token[0] == 1: 83 | output_token = output_token[1:] 84 | output_text = model.tokenizer.decode(output_token, add_special_tokens=False) 85 | output_text = output_text.split("[UNUSED_TOKEN_145]")[0].strip() 86 | return output_text 87 | 88 | 89 | def model_gen_multi_img(model, text, images, need_bos=True, padding=False): 90 | embeds = [] 91 | im_mask = [] 92 | images = images 93 | for i, pts in enumerate(text): 94 | text_embeds = model.encode_text( 95 | pts, add_special_tokens=need_bos if i == 0 else False 96 | ) 97 | embeds.append(text_embeds) 98 | im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda()) 99 | if i < len(images): 100 | assert os.path.exists(images[i]) 101 | try: 102 | image = Image.open(images[i]).convert("RGB") 103 | except: 104 | image = images[i].convert("RGB") 105 | if padding: 106 | image = __padding__(image) 107 | image = model.vis_processor(image).unsqueeze(0).cuda() 108 | image_embeds = model.encode_img(image) 109 | embeds.append(image_embeds) 110 | im_mask.append(torch.ones(image_embeds.shape[:2]).cuda()) 111 | embeds = torch.cat(embeds, dim=1) 112 | im_mask = torch.cat(im_mask, dim=1) 113 | im_mask = im_mask.bool() 114 | outputs = model.generate( 115 | inputs_embeds=embeds, 116 | im_mask=im_mask, 117 | temperature=1.0, 118 | max_new_tokens=4096, 119 | num_beams=3, 120 | do_sample=False, 121 | repetition_penalty=1.0, 122 | ) 123 | output_token = outputs[0] 124 | if output_token[0] == 0 or output_token[0] == 1: 125 | output_token = output_token[1:] 126 | output_text = model.tokenizer.decode(output_token, add_special_tokens=False) 127 | output_text = output_text.split("[UNUSED_TOKEN_145]")[0].strip() 128 | return output_text 129 | 130 | 131 | def __padding__(image): 132 | width, height = image.size 133 | tar = max(width, height) 134 | top_padding = int((tar - height) / 2) 135 | bottom_padding = tar - height - top_padding 136 | left_padding = int((tar - width) / 2) 137 | right_padding = tar - width - left_padding 138 | image = torchvision.transforms.functional.pad( 139 | image, [left_padding, top_padding, right_padding, bottom_padding] 140 | ) 141 | return image 142 | 143 | 144 | class InternLM_XComposer2_VL: 145 | def __init__( 146 | self, 147 | model_name="internlm/internlm-xcomposer2-vl-7b", 148 | image_first=False, 149 | system_message="You are a helpful assistant, dedicated to delivering comprehensive and meticulous responses.", 150 | chat_format=True, 151 | ): 152 | self.model = AutoModel.from_pretrained( 153 | model_name, trust_remote_code=True 154 | ).eval() 155 | 156 | if args.dtype == "fp16": 157 | self.model.half().cuda() 158 | elif args.dtype == "fp32": 159 | self.model.cuda() 160 | 161 | if args.num_gpus > 1: 162 | from accelerate import dispatch_model 163 | 164 | device_map = auto_configure_device_map(args.num_gpus) 165 | self.model = dispatch_model(self.model, 
device_map=device_map) 166 | 167 | self.tokenizer = AutoTokenizer.from_pretrained( 168 | model_name, trust_remote_code=True 169 | ) 170 | self.model.tokenizer = self.tokenizer 171 | self.system_message = system_message 172 | self.chat_format = chat_format 173 | 174 | def get_response(self, image_folder, prompt="What's in this image?") -> str: 175 | images = [] 176 | text_queries = [] 177 | queries = prompt.split("") 178 | for query in queries: 179 | query = query.strip() 180 | if query.endswith((".jpg", ".png", ".jpeg")): 181 | images.append(os.path.join(image_folder, query)) 182 | text_queries.append("") 183 | else: 184 | text_queries.append(query) 185 | if args.combine_imgs: 186 | text_query = "".join(text_queries) 187 | text = "[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n".format( 188 | self.system_message, text_query 189 | ) 190 | image = [process_images_for_question(images)] 191 | response = model_gen_single_img( 192 | model=self.model, 193 | text=text, 194 | images=image, 195 | ) 196 | else: 197 | text_query = ( 198 | [ 199 | "[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n".format( 200 | self.system_message 201 | ) 202 | ] 203 | + text_queries 204 | + [ 205 | "{}Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n" 206 | ] 207 | ) 208 | with torch.cuda.amp.autocast(): 209 | response = model_gen_multi_img( 210 | model=self.model, text=text_query, images=images 211 | ) 212 | return response 213 | 214 | 215 | def arg_parser(): 216 | parser = argparse.ArgumentParser() 217 | parser.add_argument( 218 | "--mmvetv2_path", 219 | type=str, 220 | default="/path/to/mm-vet-v2", 221 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 222 | ) 223 | parser.add_argument( 224 | "--result_path", 225 | type=str, 226 | default="results", 227 | ) 228 | parser.add_argument( 229 | "--model_name", 230 | type=str, 231 | default="internlm/internlm-xcomposer2-vl-7b", 232 | help="pretrained ckpt", 233 | ) 234 | parser.add_argument( 235 | "--image_first", 236 | action="store_true", 237 | help="whether text", 238 | ) 239 | parser.add_argument( 240 | "--chat_format", 241 | action="store_true", 242 | help="whether to use chat format", 243 | ) 244 | parser.add_argument( 245 | "--combine_imgs", 246 | action="store_true", 247 | help="whether to use chat format", 248 | ) 249 | parser.add_argument("--num_gpus", default=2, type=int) 250 | parser.add_argument("--dtype", default="fp16", type=str) 251 | args = parser.parse_args() 252 | return args 253 | 254 | 255 | if __name__ == "__main__": 256 | args = arg_parser() 257 | meta_instruction = """You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔). 258 | - InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. 259 | - InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by the user such as English and 中文. 
260 | - InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively based on the provided image.""" 261 | 262 | model = InternLM_XComposer2_VL( 263 | args.model_name, image_first=args.image_first, system_message=meta_instruction 264 | ) 265 | if args.image_first: 266 | args.model_name = args.model_name + "-image-first" 267 | if args.chat_format: 268 | args.model_name = args.model_name + "-chat-format" 269 | evaluate_on_mmvetv2(args, model) 270 | -------------------------------------------------------------------------------- /v2/inference/open_flamingo.py: -------------------------------------------------------------------------------- 1 | from open_flamingo import create_model_and_transforms 2 | from PIL import Image 3 | import torch 4 | import os 5 | import argparse 6 | from utils import evaluate_on_mmvetv2 7 | 8 | 9 | class OpenFlamingo: 10 | def __init__(self, model_name='open-flamingo-9b'): 11 | if model_name == 'open-flamingo-9b': 12 | clip_vision_encoder_path="ViT-L-14" 13 | clip_vision_encoder_pretrained="openai" 14 | lang_encoder_path="anas-awadalla/mpt-7b" 15 | tokenizer_path="anas-awadalla/mpt-7b" 16 | cross_attn_every_n_layers=4 17 | self.model, self.image_processor, self.tokenizer = create_model_and_transforms( 18 | clip_vision_encoder_path=clip_vision_encoder_path, 19 | clip_vision_encoder_pretrained=clip_vision_encoder_pretrained, 20 | lang_encoder_path=lang_encoder_path, 21 | tokenizer_path=tokenizer_path, 22 | cross_attn_every_n_layers=cross_attn_every_n_layers, 23 | ) 24 | 25 | self.tokenizer.padding_side = "left" # For generation padding tokens should be on the left 26 | 27 | def get_response(self, image_folder, prompt="What's in this image?") -> str: 28 | vision_x = [] 29 | text_query = "" 30 | queries = prompt.split("") 31 | for query in queries: 32 | query = query.strip() 33 | if query.endswith((".jpg", ".png", ".jpeg")): 34 | image_path = os.path.join(image_folder, query) 35 | image = Image.open(image_path).convert('RGB') 36 | vision_x.append(self.image_processor(image).unsqueeze(0)) 37 | text_query += "" 38 | else: 39 | text_query += query 40 | 41 | vision_x = torch.cat(vision_x, dim=0) 42 | vision_x = vision_x.unsqueeze(1).unsqueeze(0) 43 | 44 | lang_x = self.tokenizer( 45 | [text_query], 46 | return_tensors="pt", 47 | ) 48 | 49 | generated_text = self.model.generate( 50 | vision_x=vision_x, 51 | lang_x=lang_x["input_ids"], 52 | attention_mask=lang_x["attention_mask"], 53 | max_new_tokens=512, 54 | num_beams=3, 55 | ) 56 | 57 | response_text = self.tokenizer.decode(generated_text[0]) 58 | return response_text.strip() 59 | 60 | 61 | def arg_parser(): 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument( 64 | "--mmvetv2_path", 65 | type=str, 66 | default="/path/to/mm-vet-v2", 67 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 68 | ) 69 | parser.add_argument( 70 | "--result_path", 71 | type=str, 72 | default="results", 73 | ) 74 | parser.add_argument( 75 | "--model_name", 76 | type=str, 77 | default="open-flamingo-9b", 78 | help="Open Flamingo model name", 79 | ) 80 | args = parser.parse_args() 81 | return args 82 | 83 | if __name__ == "__main__": 84 | args = arg_parser() 85 | 86 | model = OpenFlamingo(args.model_name) 87 | evaluate_on_mmvetv2(args, model) 88 | -------------------------------------------------------------------------------- /v2/inference/qwen.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import os 4 | 
import base64 5 | import requests 6 | import argparse 7 | from utils import evaluate_on_mmvetv2 8 | from http import HTTPStatus 9 | import dashscope 10 | 11 | 12 | class Qwen: 13 | def __init__(self, model='qwen-vl-max'): 14 | self.model = model 15 | 16 | def get_response(self, image_folder, prompt="What's in this image?"): 17 | messages = [] 18 | content = [] 19 | queries = prompt.split("") 20 | img_num = 0 21 | for query in queries: 22 | query = query.strip() 23 | if query == "": 24 | continue 25 | if query.endswith((".jpg", ".png", ".jpeg")): 26 | image_path = os.path.join(image_folder, query) 27 | content.append( 28 | { 29 | "image": f"file://{image_path}" 30 | } 31 | ) 32 | img_num += 1 33 | else: 34 | content.append( 35 | { 36 | "text": query 37 | }, 38 | ) 39 | 40 | messages.append({ 41 | "role": "user", 42 | "content": content, 43 | }) 44 | 45 | payload = { 46 | "model": self.model, 47 | "messages": messages, 48 | } 49 | 50 | 51 | response = dashscope.MultiModalConversation.call(**payload) 52 | if response.status_code == HTTPStatus.OK: 53 | rps = response['output']['choices'][0]['message']['content'] 54 | for rp in rps: 55 | if 'text' in rp: 56 | response_text = rp['text'] 57 | return response_text.strip() 58 | else: 59 | print(response.code) # The error code. 60 | print(response.message) # The error message. 61 | return "" 62 | 63 | def arg_parser(): 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument( 66 | "--mmvetv2_path", 67 | type=str, 68 | default="/path/to/mm-vet-v2", 69 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 70 | ) 71 | parser.add_argument( 72 | "--result_path", 73 | type=str, 74 | default="results", 75 | ) 76 | parser.add_argument( 77 | "--dashscope_api_key", type=str, default=None, 78 | help="refer to https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start" 79 | ) 80 | parser.add_argument( 81 | "--model_name", 82 | type=str, 83 | default="qwen-vl-max", 84 | help="Qwen model name", 85 | ) 86 | args = parser.parse_args() 87 | return args 88 | 89 | 90 | if __name__ == "__main__": 91 | args = arg_parser() 92 | 93 | if args.dashscope_api_key: 94 | DASHSCOPE_API_KEY = args.dashscope_api_key 95 | else: 96 | DASHSCOPE_API_KEY = os.getenv('DASHSCOPE_API_KEY') 97 | 98 | if DASHSCOPE_API_KEY is None: 99 | raise ValueError("Please set the DASHSCOPE_API_KEY environment variable or pass it as an argument") 100 | 101 | model = Qwen(model=args.model_name) 102 | 103 | evaluate_on_mmvetv2(args, model) 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /v2/inference/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import json 4 | import base64 5 | from PIL import Image, ImageDraw 6 | 7 | 8 | def process_images_for_question(images, key=None): 9 | images = [Image.open(path) for path in images] # 10 | if not images: 11 | return # 12 | n = len(images) 13 | grid_cols = math.ceil(math.sqrt(n)) 14 | grid_rows = math.ceil(n / grid_cols) 15 | 16 | # 17 | max_width = max(img.width for img in images) 18 | max_height = max(img.height for img in images) 19 | cell_width = max_width + 20 # add gap 20 | cell_height = max_height + 30 # 21 | 22 | # 23 | collage_width = cell_width * grid_cols 24 | collage_height = cell_height * grid_rows 25 | collage = Image.new("RGB", (collage_width, collage_height), "white") 26 | draw = ImageDraw.Draw(collage) 27 | 28 | for index, img in enumerate(images): 29 | row, 
col = divmod(index, grid_cols) 30 | x = col * cell_width + (cell_width - img.width) // 2 31 | y = row * cell_height + (cell_height - img.height - 10) // 2 # 32 | collage.paste(img, (x, y + 20)) # 33 | 34 | # add img id 35 | draw.text((x + img.width // 2, y), str(index + 1), fill="black") 36 | 37 | return collage 38 | 39 | 40 | # Function to encode the image 41 | def encode_image(image_path): 42 | with open(image_path, "rb") as image_file: 43 | return base64.b64encode(image_file.read()).decode("utf-8") 44 | 45 | 46 | def evaluate_on_mmvetv2(args, model): 47 | if os.path.exists(args.result_path) is False: 48 | os.makedirs(args.result_path) 49 | 50 | model_name = args.model_name.replace("/", "--") 51 | results_path = os.path.join(args.result_path, f"{model_name}.json") 52 | image_folder = os.path.join(args.mmvetv2_path, "images") 53 | meta_data = os.path.join(args.mmvetv2_path, "mm-vet-v2.json") 54 | 55 | if os.path.exists(results_path): 56 | with open(results_path, "r") as f: 57 | results = json.load(f) 58 | else: 59 | results = {} 60 | 61 | with open(meta_data, "r") as f: 62 | data = json.load(f) 63 | 64 | for i in range(len(data)): 65 | id = f"v2_{i}" 66 | if id in results: 67 | continue 68 | prompt = data[id]["question"].strip() 69 | print(id) 70 | print(f"Prompt: {prompt}") 71 | try: 72 | response = model.get_response(image_folder, prompt) 73 | except: 74 | response = "" 75 | print(f"Response: {response}") 76 | results[id] = response 77 | with open(results_path, "w") as f: 78 | json.dump(results, f, indent=4) 79 | -------------------------------------------------------------------------------- /v2/mm-vet-v2_evaluator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from openai import OpenAI 3 | from openai._exceptions import RateLimitError, BadRequestError 4 | import json 5 | import os 6 | from tqdm import tqdm 7 | import pandas as pd 8 | import numpy as np 9 | from collections import Counter 10 | import time 11 | import pathlib 12 | 13 | prompt = """Compare the ground truth and prediction from AI models, to give a correctness score for the prediction. in the question indicates where an image is. in the ground truth means it is totally right only when all elements in the ground truth are present in the prediction, and means it is totally right when any one element in the ground truth is present in the prediction. The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, or 1.0 (totally right). Just complete the last space of the correctness score. 14 | 15 | | Question | Ground truth | Prediction | Correctness | 16 | | --- | --- | --- | --- | 17 | | What is x in the equation? | -1 -5 | x = 3 | 0.0 | 18 | | What is x in the equation? | -1 -5 | x = -1 | 0.5 | 19 | | What is x in the equation? | -1 -5 | x = -5 | 0.5 | 20 | | What is x in the equation? | -1 -5 | x = -5 or 5 | 0.5 | 21 | | What is x in the equation? | -1 -5 | x = -1 or x = -5 | 1.0 | 22 | | Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme talks about Iceland and Greenland. It's pointing out that despite their names, Iceland is not very icy and Greenland isn't very green. 
| 0.4 | 23 | | Can you explain this meme? | This meme is poking fun at the fact that the names of the countries Iceland and Greenland are misleading. Despite its name, Iceland is known for its beautiful green landscapes, while Greenland is mostly covered in ice and snow. The meme is saying that the person has trust issues because the names of these countries do not accurately represent their landscapes. | The meme is using humor to point out the misleading nature of Iceland's and Greenland's names. Iceland, despite its name, has lush green landscapes while Greenland is mostly covered in ice and snow. The text 'This is why I have trust issues' is a playful way to suggest that these contradictions can lead to distrust or confusion. The humor in this meme is derived from the unexpected contrast between the names of the countries and their actual physical characteristics. | 1.0 | 24 | """ 25 | 26 | 27 | def arg_parser(prompt=prompt): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument( 30 | "--mmvetv2_path", 31 | type=str, 32 | default="/path/to/mm-vet-v2", 33 | help="Download mm-vet.zip and `unzip mm-vet.zip` and change the path here", 34 | ) 35 | parser.add_argument( 36 | "--result_file", 37 | type=str, 38 | default="results/llava_llama2_13b_chat.json", 39 | help="path to the model result file, must end with .json", 40 | ) 41 | parser.add_argument( 42 | "--result_path", 43 | type=str, 44 | default="results", 45 | help="path to save the grading results", 46 | ) 47 | parser.add_argument( 48 | "--openai_api_key", type=str, default=None, 49 | help="If not specified, use OPENAI_API_KEY environment variable." 50 | ) 51 | parser.add_argument( 52 | "--gpt_model", type=str, default="gpt-4-0613", help="gpt model name" 53 | ) 54 | parser.add_argument( 55 | "--prompt", type=str, default=prompt, help="prompt for the model" 56 | ) 57 | parser.add_argument( 58 | "--subset", 59 | type=str, 60 | default=None, 61 | help="path to json where contains ids to evaluate", 62 | ) 63 | parser.add_argument( 64 | "--decimal_places", 65 | type=int, 66 | default=1, 67 | help="number of decimal places to round to", 68 | ) 69 | parser.add_argument( 70 | "--num_run", 71 | type=int, 72 | default=1, 73 | help="we set it as 5 in the paper", 74 | ) 75 | args = parser.parse_args() 76 | return args 77 | 78 | def get_file_names(args, model, subset_name): 79 | # grade results for each sample to svae 80 | grade_file = f"{model}_{args.gpt_model}-grade-{args.num_run}runs_dev8.json" 81 | grade_file = os.path.join(args.result_path, grade_file) 82 | 83 | # score results regarding capabilities/capability integration to save 84 | cap_score_file = ( 85 | f"{model}_{subset_name}{args.gpt_model}-cap-score-{args.num_run}runs_dev8.csv" 86 | ) 87 | cap_score_file = os.path.join(args.result_path, cap_score_file) 88 | cap_int_score_file = f"{model}_{subset_name}{args.gpt_model}-cap-int-score-{args.num_run}runs_dev8.csv" 89 | cap_int_score_file = os.path.join(args.result_path, cap_int_score_file) 90 | return grade_file, cap_score_file, cap_int_score_file 91 | 92 | 93 | def load_metadata(args): 94 | if args.subset: 95 | with open(args.subset, "r") as f: 96 | subset = json.load(f) 97 | 98 | subset_name = pathlib.Path(args.subset).stem 99 | subset_name = subset_name + "_" 100 | else: 101 | subset = None 102 | subset_name = "" 103 | 104 | mmvet_metadata = os.path.join(args.mmvetv2_path, "mm-vet-v2.json") 105 | with open(mmvet_metadata, "r") as f: 106 | data = json.load(f) 107 | 108 | counter = Counter() 109 | cap_set_list = [] 110 | 
cap_set_counter = [] 111 | len_data = 0 112 | for id, value in data.items(): 113 | if subset is not None and id not in subset: 114 | continue 115 | cap = value["capability"] 116 | cap = set(cap) 117 | counter.update(cap) 118 | if cap not in cap_set_list: 119 | cap_set_list.append(cap) 120 | cap_set_counter.append(1) 121 | else: 122 | cap_set_counter[cap_set_list.index(cap)] += 1 123 | 124 | len_data += 1 125 | 126 | sorted_list = counter.most_common() 127 | columns = [k for k, v in sorted_list] 128 | columns.append("total") 129 | columns.append("std") 130 | columns.append("runs") 131 | df = pd.DataFrame(columns=columns) 132 | 133 | cap_set_sorted_indices = np.argsort(-np.array(cap_set_counter)) 134 | new_cap_set_list = [] 135 | new_cap_set_counter = [] 136 | for index in cap_set_sorted_indices: 137 | new_cap_set_list.append(cap_set_list[index]) 138 | new_cap_set_counter.append(cap_set_counter[index]) 139 | 140 | cap_set_list = new_cap_set_list 141 | cap_set_counter = new_cap_set_counter 142 | cap_set_names = ["_".join(list(cap_set)) for cap_set in cap_set_list] 143 | 144 | columns2 = cap_set_names 145 | columns2.append("total") 146 | columns2.append("std") 147 | columns2.append("runs") 148 | df2 = pd.DataFrame(columns=columns2) 149 | return ( 150 | subset, 151 | subset_name, 152 | data, 153 | counter, 154 | cap_set_list, 155 | cap_set_counter, 156 | len_data, 157 | df, 158 | df2, 159 | cap_set_names, 160 | ) 161 | 162 | 163 | def runs( 164 | args, 165 | grade_file, 166 | data, 167 | len_data, 168 | subset=None, 169 | ): 170 | with open(args.result_file) as f: 171 | results = json.load(f) 172 | if os.path.exists(grade_file): 173 | with open(grade_file, "r") as f: 174 | grade_results = json.load(f) 175 | else: 176 | grade_results = {} 177 | 178 | def need_more_runs(args, grade_results, len_data): 179 | need_more_runs = False 180 | if len(grade_results) > 0: 181 | for k, v in grade_results.items(): 182 | if len(v["score"]) < args.num_run: 183 | need_more_runs = True 184 | break 185 | return need_more_runs or len(grade_results) < len_data 186 | 187 | while need_more_runs(args, grade_results, len_data): 188 | for j in range(args.num_run): 189 | print(f"eval run {j}") 190 | for id, line in tqdm(data.items()): 191 | if subset is not None and id not in subset: 192 | continue 193 | if id in grade_results and len(grade_results[id]["score"]) >= (j + 1): 194 | continue 195 | 196 | model_pred = results[id] 197 | queries = line['question'].split('') 198 | query = "" 199 | for q in queries: 200 | if q.endswith((".jpg", "jpeg", ".png")): 201 | query += "" 202 | else: 203 | query += q 204 | question = prompt + '| ' + ' | '.join([query.replace('\n', '
<br>'), line['answer'].replace("<AND>", " <AND> ").replace("<OR>", " <OR> ").replace('\n', '<br>'), model_pred.replace('\n', '<br>
'), ""]) 205 | # print(question) 206 | messages = [ 207 | {"role": "user", "content": question}, 208 | ] 209 | 210 | if id not in grade_results: 211 | sample_grade = {"model": [], "content": [], "score": []} 212 | else: 213 | sample_grade = grade_results[id] 214 | 215 | grade_sample_run_complete = False 216 | temperature = 0.0 217 | 218 | while not grade_sample_run_complete: 219 | try: 220 | response = client.chat.completions.create( 221 | model=args.gpt_model, 222 | max_tokens=3, 223 | temperature=temperature, 224 | messages=messages, 225 | ) 226 | content = response.choices[0].message.content 227 | # print(content) 228 | flag = True 229 | try_time = 1 230 | while flag: 231 | try: 232 | content = content.split(" ")[0].strip() 233 | score = float(content) 234 | if score > 1.0 or score < 0.0: 235 | assert False 236 | flag = False 237 | except: 238 | question_try = question + "\n\nPredict the correctness of the answer (digit): " 239 | messages = [ 240 | {"role": "user", "content": question_try}, 241 | ] 242 | # print(question_try) 243 | response = client.chat.completions.create( 244 | model=args.gpt_model, 245 | max_tokens=3, 246 | temperature=temperature, 247 | messages=messages, 248 | ) 249 | content = response.choices[0].message.content 250 | # print(content) 251 | try_time += 1 252 | temperature += 0.5 253 | print(f"{id} try {try_time} times") 254 | print(content) 255 | if try_time > 5: 256 | score = 0.0 257 | flag = False 258 | grade_sample_run_complete = True 259 | response_model = response.model 260 | except RateLimitError as e: 261 | # gpt4 may have token rate limit 262 | print("sleep 30s") 263 | time.sleep(30) 264 | except BadRequestError as e: 265 | content = "BadRequestError" 266 | score = 0.0 267 | flag = False 268 | print(id, "BadRequestError") 269 | response_model = args.gpt_model 270 | break 271 | 272 | if len(sample_grade["model"]) >= j + 1: 273 | sample_grade["model"][j] = response_model 274 | sample_grade["content"][j] = content 275 | sample_grade["score"][j] = score 276 | else: 277 | sample_grade["model"].append(response_model) 278 | sample_grade["content"].append(content) 279 | sample_grade["score"].append(score) 280 | grade_results[id] = sample_grade 281 | 282 | with open(grade_file, "w") as f: 283 | json.dump(grade_results, f, indent=4) 284 | 285 | return grade_results 286 | 287 | 288 | def export_result(args, model, df, df2, grade_results, data, cap_set_counter, cap_set_names): 289 | columns = df.columns 290 | columns2 = df2.columns 291 | 292 | cap_socres = {k: [0.0] * args.num_run for k in columns[:-2]} 293 | counter["total"] = len_data 294 | 295 | cap_socres2 = {k: [0.0] * args.num_run for k in columns2[:-2]} 296 | counter2 = {columns2[i]: cap_set_counter[i] for i in range(len(cap_set_counter))} 297 | counter2["total"] = len_data 298 | 299 | for k, v in grade_results.items(): 300 | if subset is not None and k not in subset: 301 | continue 302 | for i in range(args.num_run): 303 | score = v["score"][i] 304 | caps = set(data[k]["capability"]) 305 | for c in caps: 306 | cap_socres[c][i] += score 307 | 308 | cap_socres["total"][i] += score 309 | 310 | index = cap_set_list.index(caps) 311 | cap_socres2[cap_set_names[index]][i] += score 312 | cap_socres2["total"][i] += score 313 | 314 | for k, v in cap_socres.items(): 315 | cap_socres[k] = np.array(v) / counter[k] * 100 316 | 317 | std = round(cap_socres["total"].std(), args.decimal_places) 318 | total_copy = cap_socres["total"].copy() 319 | runs = str(list(np.round(total_copy, args.decimal_places))) 320 | 321 | for k, v 
in cap_socres.items(): 322 | cap_socres[k] = round(v.mean(), args.decimal_places) 323 | 324 | cap_socres["std"] = std 325 | cap_socres["runs"] = runs 326 | df.loc[model] = cap_socres 327 | 328 | for k, v in cap_socres2.items(): 329 | cap_socres2[k] = round( 330 | np.mean(np.array(v) / counter2[k] * 100), args.decimal_places 331 | ) 332 | cap_socres2["std"] = std 333 | cap_socres2["runs"] = runs 334 | df2.loc[model] = cap_socres2 335 | 336 | df.to_csv(cap_score_file) 337 | df2.to_csv(cap_int_score_file) 338 | 339 | return df, df2 340 | 341 | 342 | if __name__ == "__main__": 343 | args = arg_parser() 344 | 345 | if args.openai_api_key: 346 | OPENAI_API_KEY = args.openai_api_key 347 | else: 348 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 349 | client = OpenAI( 350 | api_key=OPENAI_API_KEY 351 | ) 352 | 353 | if os.path.exists(args.result_file) is False: 354 | import pdb; pdb.set_trace() 355 | raise ValueError("Result file does not exist") 356 | if not args.result_file.endswith(('.json', '.JSON')): 357 | raise ValueError("Result file should be a json file") 358 | model = pathlib.Path(args.result_file).stem 359 | 360 | metadata = load_metadata(args) 361 | ( 362 | subset, 363 | subset_name, 364 | data, 365 | counter, 366 | cap_set_list, 367 | cap_set_counter, 368 | len_data, 369 | df, 370 | df2, 371 | cap_set_names, 372 | ) = metadata 373 | file_names = get_file_names(args, model, subset_name) 374 | ( 375 | grade_file, 376 | cap_score_file, 377 | cap_int_score_file, 378 | ) = file_names 379 | grade_results = runs( 380 | args, 381 | grade_file, 382 | data, 383 | len_data, 384 | subset, 385 | ) 386 | df, df2 = export_result( 387 | args, 388 | model, 389 | df, 390 | df2, 391 | grade_results, 392 | data, 393 | cap_set_counter, 394 | cap_set_names, 395 | ) 396 | print(df) 397 | print("\n") 398 | print(df2) 399 | print("\n") 400 | print(f"Grading results are saved in:\n{grade_file}\n{cap_score_file}\n{cap_int_score_file}") --------------------------------------------------------------------------------
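A typical grading run with this evaluator (paths and file names are placeholders) might look like:
    python mm-vet-v2_evaluator.py --mmvetv2_path /path/to/mm-vet-v2 --result_file results/llava_llama2_13b_chat.json --num_run 5
Here --num_run 5 matches the five grading runs used in the paper, and the OpenAI key is read from --openai_api_key or the OPENAI_API_KEY environment variable.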