├── asset
│   ├── sta.jpg
│   ├── name_logo.jpg
│   ├── Highlights-1.png
│   ├── Highlights-2.png
│   ├── Highlights-3.png
│   ├── Highlights-4.png
│   ├── results_of_video_type.jpg
│   ├── results_of_question_type.png
│   ├── results_of_various_models.png
│   ├── results_of_video_sub_type.png
│   └── results_of_question_types_0616.png
├── evaluation
│   └── output_test_template.json
└── README.md
/asset/sta.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/sta.jpg
--------------------------------------------------------------------------------
/asset/name_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/name_logo.jpg
--------------------------------------------------------------------------------
/asset/Highlights-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/Highlights-1.png
--------------------------------------------------------------------------------
/asset/Highlights-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/Highlights-2.png
--------------------------------------------------------------------------------
/asset/Highlights-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/Highlights-3.png
--------------------------------------------------------------------------------
/asset/Highlights-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/Highlights-4.png
--------------------------------------------------------------------------------
/asset/results_of_video_type.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/results_of_video_type.jpg
--------------------------------------------------------------------------------
/asset/results_of_question_type.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/results_of_question_type.png
--------------------------------------------------------------------------------
/asset/results_of_various_models.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/results_of_various_models.png
--------------------------------------------------------------------------------
/asset/results_of_video_sub_type.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/results_of_video_sub_type.png
--------------------------------------------------------------------------------
/asset/results_of_question_types_0616.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MME-Benchmarks/Video-MME/HEAD/asset/results_of_question_types_0616.png
--------------------------------------------------------------------------------
/evaluation/output_test_template.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "video_id": "001",
4 | "duration": "short",
5 | "domain": "Knowledge",
6 | "sub_category": "Humanity & History",
7 | "questions": [
8 | {
9 | "question_id": "001-1",
10 | "task_type": "Counting Problem",
11 | "question": "When demonstrating the Germany modern Christmas tree is initially decorated with apples, candles and berries, which kind of the decoration has the largest number?",
12 | "options": [
13 | "A. Apples.",
14 | "B. Candles.",
15 | "C. Berries.",
16 | "D. The three kinds are of the same number."
17 | ],
18 | "answer": "C",
19 | "response": "C. Berries.",
20 | },
21 | {
22 | "question_id": "001-2",
23 | "task_type": "Information Synopsis",
24 | "question": "What is the genre of this video?",
25 | "options": [
26 | "A. It is a news report that introduces the history behind Christmas decorations.",
27 | "B. It is a documentary on the evolution of Christmas holiday recipes.",
28 | "C. It is a travel vlog exploring Christmas markets around the world.",
29 | "D. It is a tutorial on DIY Christmas ornament crafting."
30 | ],
31 | "answer": "A",
32 | "response": "D.",
33 | },
34 | {
35 | "question_id": "001-3",
36 | "task_type": "Counting Problem",
37 | "question": "How many red socks are above the fireplace at the end of this video?",
38 | "options": [
39 | "A. 1.",
40 | "B. 4.",
41 | "C. 2.",
42 | "D. 3."
43 | ],
44 | "answer": "D",
45 | "response": "D. 3",
46 | }
47 | ]
48 | },
49 | {
50 | "video_id": "002",
51 | "duration": "short",
52 | "domain": "Knowledge",
53 | "sub_category": "Humanity & History",
54 | "questions": [
55 | {
56 | "question_id": "002-1",
57 | "task_type": "Object Recognition",
58 | "question": "Which of the following features/items is not discussed in the video in relation to the tomb?",
59 | "options": [
60 | "A. Inkstone.",
61 | "B. Niche.",
62 | "C. Jade.",
63 | "D. Sacrificial table."
64 | ],
65 | "answer": "C",
66 | "response": "Answer: C. Jade.",
67 | },
68 | {
69 | "question_id": "002-2",
70 | "task_type": "Action Reasoning",
71 | "question": "Which of the following reasons motivated the archaeologists to excavate the tomb?",
72 | "options": [
73 | "A. Because it's from Ming Dynasty and of specific archaeological significance.",
74 | "B. Because a new railway line will be built nearby.",
75 | "C. Because there were treasures inside the tomb.",
76 | "D. Highway realignment."
77 | ],
78 | "answer": "D",
79 | "response": "D",
80 | },
81 | {
82 | "question_id": "002-3",
83 | "task_type": "Counting Problem",
84 | "question": "How many porcelain jars were discovered in the niches located in the primary chamber of the tomb?",
85 | "options": [
86 | "A. 4.",
87 | "B. 9.",
88 | "C. 5.",
89 | "D. 13."
90 | ],
91 | "answer": "B",
92 | "response": "D. 13",
93 | }
94 | ]
95 | },
96 | {
97 | "video_id": "003",
98 | "duration": "short",
99 | "domain": "Knowledge",
100 | "sub_category": "Humanity & History",
101 | "questions": [
102 | {
103 | "question_id": "003-1",
104 | "task_type": "Counting Problem",
105 | "question": "How many national flags appear in the video?",
106 | "options": [
107 | "A. 3.",
108 | "B. 4.",
109 | "C. 2.",
110 | "D. 5."
111 | ],
112 | "answer": "B",
113 | "response": "B",
114 | },
115 | {
116 | "question_id": "003-2",
117 | "task_type": "Object Recognition",
118 | "question": "What is the video telling when the burger placed in the upper right corner at the end of the video first appears?",
119 | "options": [
120 | "A. Beef with spices came from Russia to Germany.",
121 | "B. The steak began to be sandwiched between two pieces of bread.",
122 | "C. Steak burgers spread throughout the United States.",
123 | "D. The standardization of hamburgers."
124 | ],
125 | "answer": "C",
126 | "response": "C.",
127 | },
128 | {
129 | "question_id": "003-3",
130 | "task_type": "Object Reasoning",
131 | "question": "In which country is the food featured in the video recognized worldwide?",
132 | "options": [
133 | "A. Mongolia.",
134 | "B. Russia.",
135 | "C. Germany.",
136 | "D. United States."
137 | ],
138 | "answer": "D",
139 | "response": "D. United States.",
140 | }
141 | ]
142 | }
143 | ]
144 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis
2 |
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
9 |
10 |
11 |
12 |
13 |
14 | [[🍎 Project Page](https://video-mme.github.io/)] [[📖 Paper](https://arxiv.org/pdf/2405.21075)] [[📊 Dataset](https://github.com/BradyFU/Video-MME?tab=readme-ov-file#-dataset)] [[📖 MME-Survey](https://arxiv.org/pdf/2411.15296)] [[🏆 Leaderboard](https://video-mme.github.io/home_page.html#leaderboard)]
15 |
16 | Video-MME applies to both **image MLLMs**, i.e., models that generalize to multiple images, and **video MLLMs**. 🌟
17 |
18 | We are very proud to launch [**MME-Survey**](https://arxiv.org/pdf/2411.15296) (jointly introduced by **MME**, **MMBench**, and **LLaVA** teams), a comprehensive survey on evaluation of Multimodal LLMs! 🔥🔥
19 |
20 |
21 | ---
22 |
23 | ## 🔥 News
24 | * **`2025.12.05`** 🌟 [**Gemini 3 Pro**](https://blog.google/technology/developers/gemini-3-pro-vision/) has adopted our Video-MME as its video understanding benchmark.
25 | * **`2025.08.07`** 🌟 [**GPT-5**](https://openai.com/zh-Hans-CN/index/introducing-gpt-5-for-developers/) has adopted our Video-MME as its benchmark for multimodal long-context ability.
26 | * **`2025.05.06`** 🌟 [**Gemini 2.5 Pro**](https://developers.googleblog.com/en/gemini-2-5-pro-io-improved-coding-performance/) has adopted our Video-MME as its video understanding benchmark: "Gemini 2.5 Pro delivers state-of-the-art video understanding, scoring 84.8% on the VideoMME benchmark".
27 | * **`2025.04.14`** 🌟 Video-MME has been introduced and used by [**OpenAI GPT-4.1**](https://openai.com/index/gpt-4-1/) as an **"industry standard measure"** of multimodal long context ability.
28 | * **`2025.02.27`** 🌟 Video-MME has been accepted by CVPR 2025.
29 | * **`2024.06.15`** 🌟 We have refreshed our evaluation: 1) broken and potentially broken video links have been replaced and re-annotated; 2) GPT-4o now samples 384 frames (previously 10 frames taken from the website) at 512x512 resolution, boosting its overall accuracy to 71.9%.
30 | * **`2024.06.03`** 🌟 We are very proud to launch Video-MME, the first-ever comprehensive evaluation benchmark of MLLMs in Video Analysis!
31 |
32 |
33 |
34 | ## 👀 Video-MME Overview
35 |
36 | In the quest for artificial general intelligence, Multi-modal Large Language Models (MLLMs) have emerged as a focal point in recent advancements, but their potential in processing sequential visual data is still insufficiently explored. We introduce Video-MME, the first-ever full-spectrum, Multi-Modal Evaluation benchmark of MLLMs in Video analysis. It is designed to comprehensively assess the capabilities of MLLMs in processing video data, covering a wide range of visual domains, temporal durations, and data modalities. Video-MME comprises **900 videos** totaling 254 hours and **2,700 human-annotated question-answer pairs**. Our work is distinguished from existing benchmarks by four key features:
37 | * *Duration in temporal dimension*. Encompassing **short- (< 2min)**, **medium- (4min\~15min)**, and **long-term (30min\~60min)** videos, ranging from **11 seconds to 1 hour**, for robust contextual dynamics;
38 | * *Diversity in video types*. Spanning **6 primary visual domains**, i.e., Knowledge, Film & Television, Sports Competition, Artistic Performance, Life Record, and Multilingual, with **30 subfields** to ensure broad scenario generalizability;
39 | * *Breadth in data modalities*. Integrating multi-modal inputs besides video frames, including **subtitles and audio**, to assess the all-round capabilities of MLLMs;
40 | * *Quality in annotations*. **All data are newly collected and annotated by humans, not from any existing video dataset**, ensuring diversity and quality.
41 |
42 |
43 |
44 |
45 |
46 |
47 | ## 📐 Dataset Examples
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 | Click to expand more examples
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 | ## 🔍 Dataset
65 |
66 | **License**:
67 | ```
68 | Video-MME is only used for academic research. Commercial use in any form is prohibited.
69 | The copyright of all videos belongs to the video owners.
70 | If there is any infringement in Video-MME, please email videomme2024@gmail.com and we will remove it immediately.
71 | Without prior approval, you cannot distribute, publish, copy, disseminate, or modify Video-MME in whole or in part.
72 | You must strictly comply with the above restrictions.
73 | ```
74 | [Data](https://huggingface.co/datasets/lmms-lab/Video-MME/tree/main) and [Annotation](https://huggingface.co/datasets/lmms-lab/Video-MME) are provided by LMMs-Lab; see also the [Evaluation Pipeline](https://github.com/MME-Benchmarks/Video-MME?tab=readme-ov-file#-evaluation-pipeline) below.
75 |
76 | You can also directly use tools such as [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) and [LMMs-Eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) to evaluate your models on Video-MME.
77 |
78 |
79 |
80 |
81 | ## 🔮 Evaluation Pipeline
82 | 📍 **Extract Frames and Subtitles**:
83 |
84 | There are **900 videos** and **744 subtitle files** in total; all long videos have subtitles.
85 |
86 | For the setting with subtitles, you should use only the subtitles corresponding to the sampled video frames.
87 | For example, if you extract 10 frames per video for evaluation, use the 10 subtitle segments that correspond to the timestamps of those 10 frames.
88 |
89 | If you have already prepared the video and subtitle files, you can refer to [this script](https://github.com/look4u-ok/video-slicer) to extract the frames and corresponding subtitles.
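For illustration, here is a minimal Python sketch of that frame-subtitle alignment, assuming uniformly sampled frame timestamps and a standard `.srt` subtitle file. It is not part of the official pipeline (the linked script is), and the function names are only illustrative.

```python
import re

def parse_srt(path):
    """Parse an .srt file into (start_sec, end_sec, text) tuples."""
    def to_sec(ts):  # "HH:MM:SS,mmm" -> seconds
        h, m, s = ts.replace(",", ".").split(":")
        return int(h) * 3600 + int(m) * 60 + float(s)
    pattern = re.compile(
        r"(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?:\n\n|\Z)", re.S
    )
    with open(path, encoding="utf-8") as f:
        return [(to_sec(s), to_sec(e), " ".join(t.split()))
                for s, e, t in pattern.findall(f.read())]

def subtitles_for_sampled_frames(srt_path, video_duration_sec, num_frames=10):
    """Keep only the subtitle cues that overlap the sampled frame timestamps."""
    timestamps = [(i + 0.5) * video_duration_sec / num_frames for i in range(num_frames)]
    selected = [text for start, end, text in parse_srt(srt_path)
                if any(start <= t <= end for t in timestamps)]
    return "\n".join(dict.fromkeys(selected))  # de-duplicate while preserving order
```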
90 |
91 |
92 | 📍 **Prompt**:
93 |
94 | The common prompt used in our evaluation follows this format:
95 |
96 | ```
97 | This video's subtitles are listed below:
98 | [Subtitles]
99 | Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.
100 | [Question]
101 | The best answer is:
102 | ```
103 |
104 | For the subtitles-free setting, you should remove the subtitle content.
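For reference, a prompt in exactly this format can be assembled with a small helper like the sketch below (illustrative only, not part of this repository); pass `subtitles=None` for the subtitles-free setting.

```python
def build_prompt(question, options, subtitles=None):
    """Assemble the Video-MME style multiple-choice prompt."""
    lines = []
    if subtitles:  # omit this block entirely in the subtitles-free setting
        lines += ["This video's subtitles are listed below:", subtitles]
    lines += [
        "Select the best answer to the following multiple-choice question based on the video. "
        "Respond with only the letter (A, B, C, or D) of the correct option.",
        question,
        *options,
        "The best answer is:",
    ]
    return "\n".join(lines)
```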
105 |
106 |
107 |
108 | Click to expand the prompt examples.
109 |
110 | * With subtitles:
111 |
112 | ```
113 | This video's subtitles are listed below:
114 | Hi guys, I'm going to show you how to perfectly prepare a ...
115 | Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.
116 | What is the color of the clothing worn by the persons in the video?
117 | A. Black.
118 | B. Gray.
119 | C. Green.
120 | D. Brown.
121 | The best answer is:
122 | ```
123 |
124 | * Without subtitles:
125 | ```
126 | Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.
127 | What is the color of the clothing worn by the persons in the video?
128 | A. Black.
129 | B. Gray.
130 | C. Green.
131 | D. Brown.
132 | The best answer is:
133 | ```
134 |
135 |
136 |
137 | 📍 **Evaluation**:
138 |
139 | To extract answers and calculate scores, add each model response to a JSON file; we provide an example template, [output_test_template.json](./evaluation/output_test_template.json). Once the model responses are prepared in this format, run the evaluation script [eval_your_results.py](https://github.com/thanku-all/parse_answer/blob/main/eval_your_results.py) to obtain accuracy scores across video durations, video domains, video sub-categories, and task types.
140 | The evaluation does not rely on any third-party models, such as ChatGPT.
141 |
142 | ```bash
143 | python eval_your_results.py \
144 | --results_file $YOUR_RESULTS_FILE \
145 | --video_duration_type $VIDEO_DURATION_TYPE \
146 | --return_categories_accuracy \
147 | --return_sub_categories_accuracy \
148 | --return_task_types_accuracy
149 | ```
150 | Please ensure that the `results_file` follows the JSON format specified above, and that `video_duration_type` is set to `short`, `medium`, or `long`. If you wish to assess results across several duration types, you can specify multiple types separated by commas or organize them in a list, for example: `short,medium,long` or `["short","medium","long"]`.
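For reference, a minimal (unofficial) scoring sketch over a file in this template format could look like the following. It assumes the option letter appears somewhere in each `response` string, as in the template; the official script above additionally reports per-category, per-sub-category, and per-task-type breakdowns.

```python
import json
import re
from collections import defaultdict

def extract_choice(response):
    """Return the first standalone option letter A-D found in a model response."""
    match = re.search(r"\b([A-D])\b", response.strip())
    return match.group(1) if match else None

def score(results_file, duration_types=("short", "medium", "long")):
    with open(results_file, encoding="utf-8") as f:
        videos = json.load(f)
    per_task = defaultdict(lambda: [0, 0])  # task_type -> [correct, total]
    correct = total = 0
    for video in videos:
        if video["duration"] not in duration_types:
            continue
        for q in video["questions"]:
            hit = int(extract_choice(q.get("response", "")) == q["answer"])
            per_task[q["task_type"]][0] += hit
            per_task[q["task_type"]][1] += 1
            correct, total = correct + hit, total + 1
    print(f"overall: {correct}/{total} = {correct / max(total, 1):.2%}")
    for task, (c, t) in sorted(per_task.items()):
        print(f"{task}: {c}/{t} = {c / t:.2%}")
```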
151 |
152 | 📍 **Leaderboard**:
153 |
154 | If you want to add your model to our [leaderboard](https://video-mme.github.io/home_page.html#leaderboard), please send the model responses to **bradyfu24@gmail.com**, in the format of [output_test_template.json](./evaluation/output_test_template.json).
155 |
156 |
157 | ## 📈 Experimental Results
158 | - **Evaluation results of different MLLMs.**
159 |
160 |
161 |
162 |
163 |
164 |
165 | - **Evaluation results of different MLLMs across different task types.**
166 |
167 |
168 |
169 |
170 |
171 | - **Evaluation results of Gemini 1.5 Pro across different video duration types.**
172 |
173 |
174 |
175 |
176 |
177 | - **Evaluation results of Gemini 1.5 Pro across different video sub-types.**
178 |
179 |
180 |
181 |
182 |
183 |
184 | ## :black_nib: Citation
185 |
186 | If you find our work helpful for your research, please consider citing it.
187 |
188 | ```bibtex
189 | @inproceedings{fu2025video,
190 | title={Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis},
191 | author={Fu, Chaoyou and Dai, Yuhan and Luo, Yongdong and Li, Lei and Ren, Shuhuai and Zhang, Renrui and Wang, Zihan and Zhou, Chenyu and Shen, Yunhang and Zhang, Mengdan and others},
192 | booktitle={CVPR},
193 | year={2025}
194 | }
195 |
196 | @inproceedings{fu2025mme,
197 | title={Mme: A comprehensive evaluation benchmark for multimodal large language models},
198 | author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and Sun, Xing and others},
199 | booktitle={NeurIPS Datasets and Benchmarks Track},
200 | year={2025}
201 | }
202 |
203 | @article{fu2024mme,
204 | title={MME-Survey: A Comprehensive Survey on Evaluation of Multimodal LLMs},
205 | author={Fu, Chaoyou and Zhang, Yi-Fan and Yin, Shukang and Li, Bo and Fang, Xinyu and Zhao, Sirui and Duan, Haodong and Sun, Xing and Liu, Ziwei and Wang, Liang and others},
206 | journal={arXiv preprint arXiv:2411.15296},
207 | year={2024}
208 | }
209 |
210 | @article{zhang2024mme,
211 | title={MME-RealWorld: Could Your Multimodal LLM Challenge High-Resolution Real-World Scenarios that are Difficult for Humans?},
212 | author={Zhang, Yi-Fan and Zhang, Huanyu and Tian, Haochen and Fu, Chaoyou and Zhang, Shuangqing and Wu, Junfei and Li, Feng and Wang, Kun and Wen, Qingsong and Zhang, Zhang and others},
213 | journal={arXiv preprint arXiv:2408.13257},
214 | year={2024}
215 | }
216 | ```
217 |
218 | ## 📜 Related Works
219 |
220 | Explore our related research:
221 | - **[MME-Survey]** [MME-Survey: A Comprehensive Survey on Evaluation of Multimodal LLMs](https://arxiv.org/pdf/2411.15296)
222 | - **[MME]** [MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models](https://arxiv.org/pdf/2306.13394)
223 | - **[MME-RealWorld]** [MME-RealWorld: Could Your Multimodal LLM Challenge High-Resolution Real-World Scenarios that are Difficult for Humans?](https://arxiv.org/pdf/2408.13257)
224 | - **[Awesome-MLLM]** [A Survey on Multimodal Large Language Models](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models)
225 |
226 |
--------------------------------------------------------------------------------