├── asset ├── sta.jpg ├── name_logo.jpg ├── Highlights-1.png ├── Highlights-2.png ├── Highlights-3.png ├── Highlights-4.png ├── results_of_video_type.jpg ├── results_of_question_type.png ├── results_of_various_models.png ├── results_of_video_sub_type.png └── results_of_question_types_0616.png ├── evaluation └── output_test_template.json └── README.md /asset/sta.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/sta.jpg -------------------------------------------------------------------------------- /asset/name_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/name_logo.jpg -------------------------------------------------------------------------------- /asset/Highlights-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/Highlights-1.png -------------------------------------------------------------------------------- /asset/Highlights-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/Highlights-2.png -------------------------------------------------------------------------------- /asset/Highlights-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/Highlights-3.png -------------------------------------------------------------------------------- /asset/Highlights-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/Highlights-4.png -------------------------------------------------------------------------------- 
/asset/results_of_video_type.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/results_of_video_type.jpg -------------------------------------------------------------------------------- /asset/results_of_question_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/results_of_question_type.png -------------------------------------------------------------------------------- /asset/results_of_various_models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/results_of_various_models.png -------------------------------------------------------------------------------- /asset/results_of_video_sub_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/results_of_video_sub_type.png -------------------------------------------------------------------------------- /asset/results_of_question_types_0616.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/results_of_question_types_0616.png -------------------------------------------------------------------------------- /evaluation/output_test_template.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "video_id": "001", 4 | "duration": "short", 5 | "domain": "Knowledge", 6 | "sub_category": "Humanity & History", 7 | "questions": [ 8 | { 9 | "question_id": "001-1", 10 | "task_type": "Counting Problem", 11 | "question": "When demonstrating the Germany modern Christmas tree is initially decorated with apples, candles and berries, which kind of the decoration 
has the largest number?", 12 | "options": [ 13 | "A. Apples.", 14 | "B. Candles.", 15 | "C. Berries.", 16 | "D. The three kinds are of the same number." 17 | ], 18 | "answer": "C", 19 | "response": "C. Berries." 20 | }, 21 | { 22 | "question_id": "001-2", 23 | "task_type": "Information Synopsis", 24 | "question": "What is the genre of this video?", 25 | "options": [ 26 | "A. It is a news report that introduces the history behind Christmas decorations.", 27 | "B. It is a documentary on the evolution of Christmas holiday recipes.", 28 | "C. It is a travel vlog exploring Christmas markets around the world.", 29 | "D. It is a tutorial on DIY Christmas ornament crafting." 30 | ], 31 | "answer": "A", 32 | "response": "D." 33 | }, 34 | { 35 | "question_id": "001-3", 36 | "task_type": "Counting Problem", 37 | "question": "How many red socks are above the fireplace at the end of this video?", 38 | "options": [ 39 | "A. 1.", 40 | "B. 4.", 41 | "C. 2.", 42 | "D. 3." 43 | ], 44 | "answer": "D", 45 | "response": "D. 3" 46 | } 47 | ] 48 | }, 49 | { 50 | "video_id": "002", 51 | "duration": "short", 52 | "domain": "Knowledge", 53 | "sub_category": "Humanity & History", 54 | "questions": [ 55 | { 56 | "question_id": "002-1", 57 | "task_type": "Object Recognition", 58 | "question": "Which of the following features/items is not discussed in the video in relation to the tomb?", 59 | "options": [ 60 | "A. Inkstone.", 61 | "B. Niche.", 62 | "C. Jade.", 63 | "D. Sacrificial table." 64 | ], 65 | "answer": "C", 66 | "response": "Answer: C. Jade." 67 | }, 68 | { 69 | "question_id": "002-2", 70 | "task_type": "Action Reasoning", 71 | "question": "Which of the following reasons motivated the archaeologists to excavate the tomb?", 72 | "options": [ 73 | "A. Because it's from Ming Dynasty and of specific archaeological significance.", 74 | "B. Because a new railway line will be built nearby.", 75 | "C. Because there were treasures inside the tomb.", 76 | "D. Highway realignment." 
77 | ], 78 | "answer": "D", 79 | "response": "D" 80 | }, 81 | { 82 | "question_id": "002-3", 83 | "task_type": "Counting Problem", 84 | "question": "How many porcelain jars were discovered in the niches located in the primary chamber of the tomb?", 85 | "options": [ 86 | "A. 4.", 87 | "B. 9.", 88 | "C. 5.", 89 | "D. 13." 90 | ], 91 | "answer": "B", 92 | "response": "D. 13" 93 | } 94 | ] 95 | }, 96 | { 97 | "video_id": "003", 98 | "duration": "short", 99 | "domain": "Knowledge", 100 | "sub_category": "Humanity & History", 101 | "questions": [ 102 | { 103 | "question_id": "003-1", 104 | "task_type": "Counting Problem", 105 | "question": "How many national flags appear in the video?", 106 | "options": [ 107 | "A. 3.", 108 | "B. 4.", 109 | "C. 2.", 110 | "D. 5." 111 | ], 112 | "answer": "B", 113 | "response": "B" 114 | }, 115 | { 116 | "question_id": "003-2", 117 | "task_type": "Object Recognition", 118 | "question": "What is the video telling when the burger placed in the upper right corner at the end of the video first appears?", 119 | "options": [ 120 | "A. Beef with spices came from Russia to Germany.", 121 | "B. The steak began to be sandwiched between two pieces of bread.", 122 | "C. Steak burgers spread throughout the United States.", 123 | "D. The standardization of hamburgers." 124 | ], 125 | "answer": "C", 126 | "response": "C." 127 | }, 128 | { 129 | "question_id": "003-3", 130 | "task_type": "Object Reasoning", 131 | "question": "In which country is the food featured in the video recognized worldwide?", 132 | "options": [ 133 | "A. Mongolia.", 134 | "B. Russia.", 135 | "C. Germany.", 136 | "D. United States." 137 | ], 138 | "answer": "D", 139 | "response": "D. 
United States.", 140 | } 141 | ] 142 | }, 143 | ] 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis 2 | 3 | ![VideoQA](https://img.shields.io/badge/Task-VideoQA-red) 4 | ![Multi-Modal](https://img.shields.io/badge/Task-Multi--Modal-red) 5 | ![Video-MME](https://img.shields.io/badge/Dataset-Video--MME-blue) 6 | ![Gemini](https://img.shields.io/badge/Model-Gemini-green) 7 | ![GPT-4V](https://img.shields.io/badge/Model-GPT--4V-green) 8 | ![GPT-4o](https://img.shields.io/badge/Model-GPT--4o-green) 9 | 10 |

11 | 12 |

13 | 14 |
[[🍎 Project Page](https://video-mme.github.io/)] [[📖 Paper](https://arxiv.org/pdf/2405.21075)] [[📊 Dataset](https://github.com/BradyFU/Video-MME?tab=readme-ov-file#-dataset)] [[📖 MME-Survey](https://arxiv.org/pdf/2411.15296)] [[🏆 Leaderboard](https://video-mme.github.io/home_page.html#leaderboard)]
15 | 16 | Video-MME applies to both **image MLLMs**, i.e., generalizing to multiple images, and **video MLLMs**. 🌟 17 | 18 | We are very proud to launch [**MME-Survey**](https://arxiv.org/pdf/2411.15296) (jointly introduced by **MME**, **MMBench**, and **LLaVA** teams), a comprehensive survey on evaluation of Multimodal LLMs! 🔥🔥 19 | 20 | 21 | --- 22 | 23 | ## 🔥 News 24 | * **`2025.12.05`** 🌟 [**Gemini 3 Pro**](https://blog.google/technology/developers/gemini-3-pro-vision/) has used our Video-MME as the benchmark of video. 25 | * **`2025.08.07`** 🌟 [**GPT-5**](https://openai.com/zh-Hans-CN/index/introducing-gpt-5-for-developers/) has used our Video-MME as the benchmark of multimodal long context. 26 | * **`2025.05.06`** 🌟 [**Gemini 2.5 Pro**](https://developers.googleblog.com/en/gemini-2-5-pro-io-improved-coding-performance/) has used our Video-MME as the benchmark of video understanding: "Gemini 2.5 Pro delivers state-of-the-art video understanding, scoring 84.8% on the VideoMME benchmark". 27 | * **`2025.04.14`** 🌟 Video-MME has been introduced and used by [**OpenAI GPT-4.1**](https://openai.com/index/gpt-4-1/) as an **"industry standard measure"** of multimodal long context ability. 28 | * **`2025.02.27`** 🌟 Video-MME has been accepted by CVPR 2025. 29 | * **`2024.06.15`** 🌟 We have refreshed our evaluation: 1) replace broken and potentially broken video links, and re-annotated them; 2) GPT-4o now samples 384 frames (previously 10 from the website) at 512x512 resolution, boosting overall accuracy to 71.9%. 30 | * **`2024.06.03`** 🌟 We are very proud to launch Video-MME, the first-ever comprehensive evaluation benchmark of MLLMs in Video Analysis! 31 | 32 | 33 | 34 | ## 👀 Video-MME Overview 35 | 36 | In the quest for artificial general intelligence, Multi-modal Large Language Models (MLLMs) have emerged as a focal point in recent advancements, but their potential in processing sequential visual data is still insufficiently explored. 
We introduce Video-MME, the first-ever full-spectrum, Multi-Modal Evaluation benchmark of MLLMs in Video analysis. It is designed to comprehensively assess the capabilities of MLLMs in processing video data, covering a wide range of visual domains, temporal durations, and data modalities. Video-MME comprises **900 videos** with a total of 254 hours, and **2,700 human-annotated question-answer pairs**. Our work distinguishes itself from existing benchmarks through four key features: 37 | * *Duration in temporal dimension*. Encompassing **short- (< 2min)**, **medium- (4min\~15min)**, and **long-term (30min\~60min)** videos, ranging from **11 seconds to 1 hour**, for robust contextual dynamics; 38 | * *Diversity in video types*. Spanning **6 primary visual domains**, i.e., Knowledge, Film & Television, Sports Competition, Artistic Performance, Life Record, and Multilingual, with **30 subfields** to ensure broad scenario generalizability; 39 | * *Breadth in data modalities*. Integrating multi-modal inputs besides video frames, including **subtitles and audio**, to assess the all-round capabilities of MLLMs; 40 | * *Quality in annotations*. **All data are newly collected and annotated by humans, not from any existing video dataset**, ensuring diversity and quality. 41 | 42 | 

44 | 45 |

46 | 47 | ## 📐 Dataset Examples 48 | 49 |

50 | 51 |

52 | 53 |
54 |
55 | Click to expand more examples 56 |

57 | 58 | 59 | 60 |

61 |
62 | 63 | 64 | ## 🔍 Dataset 65 | 66 | **License**: 67 | ``` 68 | Video-MME is only used for academic research. Commercial use in any form is prohibited. 69 | The copyright of all videos belongs to the video owners. 70 | If there is any infringement in Video-MME, please email videomme2024@gmail.com and we will remove it immediately. 71 | Without prior approval, you cannot distribute, publish, copy, disseminate, or modify Video-MME in whole or in part. 72 | You must strictly comply with the above restrictions. 73 | ``` 74 | [Data](https://huggingface.co/datasets/lmms-lab/Video-MME/tree/main) and [Annotation](https://huggingface.co/datasets/lmms-lab/Video-MME) provided by LMMS-Lab, and [Evaluation Pipeline](https://github.com/MME-Benchmarks/Video-MME?tab=readme-ov-file#-evaluation-pipeline). 75 | 76 | You could choose to directly use tools like [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) and [LMMs-Eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) to evaluate your models on Video-MME. 77 | 78 | 79 | 80 | 81 | ## 🔮 Evaluation Pipeline 82 | 📍 **Extract Frames and Subtitles**: 83 | 84 | There are a total of **900 videos** and **744 subtitles**, where all long videos have subtitles. 85 | 86 | With respect to the setting of adding subtitles, you should only use the subtitles corresponding to the sampled video frames. 87 | For example, if you extract 10 frames per video for evaluation, take the 10 subtitles that corresponding to the time of those 10 frames. 88 | 89 | If you have already prepared the video and subtitle file, you could refer to [this script](https://github.com/look4u-ok/video-slicer) to extract the frames and corresponding subtitles. 90 | 91 | 92 | 📍 **Prompt**: 93 | 94 | The common prompt used in our evaluation follows this format: 95 | 96 | ``` 97 | This video's subtitles are listed below: 98 | [Subtitles] 99 | Select the best answer to the following multiple-choice question based on the video. 
Respond with only the letter (A, B, C, or D) of the correct option. 100 | [Question] 101 | The best answer is: 102 | ``` 103 | 104 | For the subtitles-free setting, you should remove the subtitle content. 105 | 106 | 107 |
108 | Click to expand the prompt examples. 109 | 110 | * With subtitles: 111 | 112 | ``` 113 | This video's subtitles are listed below: 114 | Hi guys, I'm going to show you how to perfectly prepare a ... 115 | Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option. 116 | What is the color of the clothing worn by the persons in the video? 117 | A. Black. 118 | B. Gray. 119 | C. Green. 120 | D. Brown. 121 | The best answer is: 122 | ``` 123 | 124 | * Without subtitles: 125 | ``` 126 | Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option. 127 | What is the color of the clothing worn by the persons in the video? 128 | A. Black. 129 | B. Gray. 130 | C. Green. 131 | D. Brown. 132 | The best answer is: 133 | ``` 134 |
135 | 136 | 137 | 📍 **Evaluation**: 138 | 139 | To extract the answer and calculate the scores, we add the model response to a JSON file. Here we provide an example template [output_test_template.json](./evaluation/output_test_template.json). Once you have prepared the model responses in this format, please refer to the evaluation script [eval_your_results.py](https://github.com/thanku-all/parse_answer/blob/main/eval_your_results.py), and you will get the accuracy scores across video_durations, video domains, video subcategories, and task types. 140 | The evaluation does not introduce any third-party models, such as ChatGPT. 141 | 142 | ```bash 143 | python eval_your_results.py \ 144 | --results_file $YOUR_RESULTS_FILE \ 145 | --video_duration_type $VIDEO_DURATION_TYPE \ 146 | --return_categories_accuracy \ 147 | --return_sub_categories_accuracy \ 148 | --return_task_types_accuracy 149 | ``` 150 | Please ensure that the `results_file` follows the specified JSON format stated above, and `video_duration_type` is specified as either `short`, `medium`, or `long`. If you wish to assess results across various duration types, you can specify multiple types separated by commas or organize them in a list, for example: `short,medium,long` or `["short","medium","long"]`. 151 | 152 | 📍 **Leaderboard**: 153 | 154 | If you want to add your model to our [leaderboard](https://video-mme.github.io/home_page.html#leaderboard), please send model responses to **bradyfu24@gmail.com**, as the format of [output_test_template.json](./evaluation/output_test_template.json). 155 | 156 | 157 | ## 📈 Experimental Results 158 | - **Evaluation results of different MLLMs.** 159 | 160 |

161 | 162 |

163 | 164 | 165 | - **Evaluation results of different MLLMs across different task types.** 166 | 167 |

168 | 169 |

170 | 171 | - **Evaluation results of Gemini 1.5 Pro across different video duration types.** 172 | 173 |

174 | 175 |

176 | 177 | - **Evaluation results of Gemini 1.5 Pro across different video sub-types.** 178 | 179 |

180 | 181 |

182 | 183 | 184 | ## :black_nib: Citation 185 | 186 | If you find our work helpful for your research, please consider citing our work. 187 | 188 | ```bibtex 189 | @inproceedings{fu2025video, 190 | title={Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis}, 191 | author={Fu, Chaoyou and Dai, Yuhan and Luo, Yongdong and Li, Lei and Ren, Shuhuai and Zhang, Renrui and Wang, Zihan and Zhou, Chenyu and Shen, Yunhang and Zhang, Mengdan and others}, 192 | booktitle={CVPR}, 193 | year={2025} 194 | } 195 | 196 | @inproceedings{fu2025mme, 197 | title={Mme: A comprehensive evaluation benchmark for multimodal large language models}, 198 | author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and Sun, Xing and others}, 199 | booktitle={NeurIPS Datasets and Benchmarks Track}, 200 | year={2025} 201 | } 202 | 203 | @article{fu2024mme, 204 | title={MME-Survey: A Comprehensive Survey on Evaluation of Multimodal LLMs}, 205 | author={Fu, Chaoyou and Zhang, Yi-Fan and Yin, Shukang and Li, Bo and Fang, Xinyu and Zhao, Sirui and Duan, Haodong and Sun, Xing and Liu, Ziwei and Wang, Liang and others}, 206 | journal={arXiv preprint arXiv:2411.15296}, 207 | year={2024} 208 | } 209 | 210 | @article{zhang2024mme, 211 | title={MME-RealWorld: Could Your Multimodal LLM Challenge High-Resolution Real-World Scenarios that are Difficult for Humans?}, 212 | author={Zhang, Yi-Fan and Zhang, Huanyu and Tian, Haochen and Fu, Chaoyou and Zhang, Shuangqing and Wu, Junfei and Li, Feng and Wang, Kun and Wen, Qingsong and Zhang, Zhang and others}, 213 | journal={arXiv preprint arXiv:2408.13257}, 214 | year={2024} 215 | } 216 | ``` 217 | 218 | ## 📜 Related Works 219 | 220 | Explore our related researches: 221 | - **[MME-Survey]** [MME-Survey: A Comprehensive Survey on Evaluation of Multimodal LLMs](https://arxiv.org/pdf/2411.15296) 222 | - **[MME]** [MME: A Comprehensive 
Evaluation Benchmark for Multimodal Large Language Models](https://arxiv.org/pdf/2306.13394) 223 | - **[MME-RealWorld]** [MME-RealWorld: Could Your Multimodal LLM Challenge High-Resolution Real-World Scenarios that are Difficult for Humans?](https://arxiv.org/pdf/2408.13257) 224 | - **[Awesome-MLLM]** [A Survey on Multimodal Large Language Models](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models) 225 | 226 | --------------------------------------------------------------------------------