├── Eval_QA
    ├── ActivityNet_QA_new.json
    ├── Driving-decision-making_QA_new.json
    ├── Driving-exam_QA_new.json
    ├── MOT_QA.json
    ├── MOT_QA_new.json
    ├── MSRVTT_QA_new.json
    ├── MSVD_QA_new.json
    ├── MV_QA_new.json
    ├── NBA_QA_new.json
    ├── SQA3D_QA_new.json
    ├── TGIF_QA_new.json
    ├── TVQA_QA_new.json
    ├── Ucfcrime_QA_new.json
    └── Youcook2_QA_new.json
├── README.md
├── Step2_T5_judge.py
├── Step2_chatgpt_judge.py
├── Step3_merge_into_one_json.py
├── assets
    ├── 1
    ├── leaderboard2.png
    ├── logo.png
    ├── logo2.png
    └── pie_fig.jpg
└── requirements.txt


/Eval_QA/Driving-exam_QA_new.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "918600": {
  3 |     "vid_path": "Eval_video/Driving-exam/918600.mp4",
  4 |     "video_id": "918600",
  5 |     "question": "1/36. As shown in the video, how should you deal with this situation when driving a motor vehicle?",
  6 |     "choices": {
  7 |       "A": " Accelerate passing",
  8 |       "B": " Leave a lateral safety distance and drive slowly",
  9 |       "C": " Keep driving at normal speed",
 10 |       "D": " Emergency braking when approaching"
 11 |     }
 12 |   },
 13 |   "918700": {
 14 |     "vid_path": "Eval_video/Driving-exam/918700.mp4",
 15 |     "video_id": "918700",
 16 |     "question": "2/36. As shown in the video, how should you deal with this situation when driving a motor vehicle?",
 17 |     "choices": {
 18 |       "A": " Emergency braking",
 19 |       "B": " Turn sharply and bypass quickly",
 20 |       "C": " Stop smoothly",
 21 |       "D": " Avoid collision quickly"
 22 |     }
 23 |   },
 24 |   "919600": {
 25 |     "vid_path": "Eval_video/Driving-exam/919600.mp4",
 26 |     "video_id": "919600",
 27 |     "question": "3/36. As shown in the video, how should the car in front deal with this situation?",
 28 |     "choices": {
 29 |       "A": " Accelerate on the right side of the road",
 30 |       "B": " Give appropriate space and speed up",
 31 |       "C": " Rapid deceleration or emergency braking",
 32 |       "D": " Slow down and give way to the right side of the road."
 33 |     }
 34 |   },
 35 |   "921000": {
 36 |     "vid_path": "Eval_video/Driving-exam/921000.mp4",
 37 |     "video_id": "921000",
 38 |     "question": "4/36. As shown in the video, when overtaking on the road, the lateral distance should be increased as much as possible, and when necessary, you can cross the solid line to overtake. Is this statement correct?",
 39 |     "choices": {
 40 |       "A": " correct",
 41 |       "B": " error"
 42 |     }
 43 |   },
 44 |   "926100": {
 45 |     "vid_path": "Eval_video/Driving-exam/926100.mp4",
 46 |     "video_id": "926100",
 47 |     "question": "5/36. As shown in the video, how should I deal with this situation when driving a motor vehicle?",
 48 |     "choices": {
 49 |       "A": " Use the opposite lane to drive",
 50 |       "B": " Drive on the outside of the curve",
 51 |       "C": " Slow down sufficiently and drive on the right side",
 52 |       "D": " rapid braking and low speed passing"
 53 |     }
 54 |   },
 55 |   "927300": {
 56 |     "vid_path": "Eval_video/Driving-exam/927300.mp4",
 57 |     "video_id": "927300",
 58 |     "question": "6/36. As shown in the video, how should I deal with this situation when driving a motor vehicle?",
 59 |     "choices": {
 60 |       "A": " Slow down or stop and give way",
 61 |       "B": " Change lanes immediately to bypass pedestrians",
 62 |       "C": " Honk the horn to signal them to give way",
 63 |       "D": " Pass before pedestrians"
 64 |     }
 65 |   },
 66 |   "929800": {
 67 |     "vid_path": "Eval_video/Driving-exam/929800.mp4",
 68 |     "video_id": "929800",
 69 |     "question": "7/36, as shown in the video, how should I deal with this situation when driving a motor vehicle?",
 70 |     "choices": {
 71 |       "A": " pay attention to observe the dynamics of pedestrians and non-motorized vehicles before passing",
 72 |       "B": " Accelerate passing",
 73 |       "C": " Stop immediately",
 74 |       "D": " honk the horn to signal pedestrians to give way"
 75 |     }
 76 |   },
 77 |   "954500": {
 78 |     "vid_path": "Eval_video/Driving-exam/954500.mp4",
 79 |     "video_id": "954500",
 80 |     "question": "8/36. As shown in the video, is the driver's behavior correct?",
 81 |     "choices": {
 82 |       "A": " correct",
 83 |       "B": " error"
 84 |     }
 85 |   },
 86 |   "928200": {
 87 |     "vid_path": "Eval_video/Driving-exam/928200.mp4",
 88 |     "video_id": "928200",
 89 |     "question": "9/36, as shown in the video, the driver's behavior is correct. Is this statement correct?",
 90 |     "choices": {
 91 |       "A": " correct",
 92 |       "B": " error"
 93 |     }
 94 |   },
 95 |   "929500": {
 96 |     "vid_path": "Eval_video/Driving-exam/929500.mp4",
 97 |     "video_id": "929500",
 98 |     "question": "10/36. As shown in the video, it is correct for motor vehicles to pass through railway crossings. Is this statement correct?",
 99 |     "choices": {
100 |       "A": " correct",
101 |       "B": " error"
102 |     }
103 |   },
104 |   "955800": {
105 |     "vid_path": "Eval_video/Driving-exam/955800.mp4",
106 |     "video_id": "955800",
107 |     "question": "11/36. As shown in the video, what should the driver do?",
108 |     "choices": {
109 |       "A": " honk to urge",
110 |       "B": " slow down and stop, wait in line one by one",
111 |       "C": " Look for opportunities to overtake the car in front",
112 |       "D": " passing through the vehicle space"
113 |     }
114 |   },
115 |   "963200": {
116 |     "vid_path": "Eval_video/Driving-exam/963200.mp4",
117 |     "video_id": "963200",
118 |     "question": "12/36. Vehicles should 'slow down, honk, and keep to the right' when passing a curve on a mountainous road. Is this statement correct?",
119 |     "choices": {
120 |       "A": " correct",
121 |       "B": " error"
122 |     }
123 |   },
124 |   "972400": {
125 |     "vid_path": "Eval_video/Driving-exam/972400.mp4",
126 |     "video_id": "972400",
127 |     "question": "13/36. What should drivers do to ensure safety when driving through waterlogged roads?",
128 |     "choices": {
129 |       "A": " pay special attention to slow down",
130 |       "B": " quickly accelerate to pass",
131 |       "C": " accelerate through in low gear",
132 |       "D": " pass at normal speed"
133 |     }
134 |   },
135 |   "975300": {
136 |     "vid_path": "Eval_video/Driving-exam/975300.mp4",
137 |     "video_id": "975300",
138 |     "question": "14/36. As shown in the video, is the driving behavior of car A correct when the motor vehicle is driving on the highway?",
139 |     "choices": {
140 |       "A": " correct",
141 |       "B": " error"
142 |     }
143 |   },
144 |   "975400": {
145 |     "vid_path": "Eval_video/Driving-exam/975400.mp4",
146 |     "video_id": "975400",
147 |     "question": "15/36. As shown in the video, when the expressway is blocked due to an accident, is the driving behavior of car A correct?",
148 |     "choices": {
149 |       "A": " correct",
150 |       "B": " error"
151 |     }
152 |   },
153 |   "909800": {
154 |     "vid_path": "Eval_video/Driving-exam/909800.mp4",
155 |     "video_id": "909800",
156 |     "question": "16/36, how many illegal acts are there in video 1?",
157 |     "choices": {
158 |       "A": " Three illegal acts",
159 |       "B": " Four illegal acts",
160 |       "C": " Two illegal acts",
161 |       "D": " An illegal act"
162 |     }
163 |   },
164 |   "909900": {
165 |     "vid_path": "Eval_video/Driving-exam/909900.mp4",
166 |     "video_id": "909900",
167 |     "question": "17/36, how many illegal acts are there in video 2?",
168 |     "choices": {
169 |       "A": " Four illegal acts",
170 |       "B": " Three illegal acts",
171 |       "C": " Two illegal acts",
172 |       "D": " An illegal act"
173 |     }
174 |   },
175 |   "910000": {
176 |     "vid_path": "Eval_video/Driving-exam/910000.mp4",
177 |     "video_id": "910000",
178 |     "question": "18/36, how many illegal acts are there in video 3?",
179 |     "choices": {
180 |       "A": " Three illegal acts",
181 |       "B": " Four illegal acts",
182 |       "C": " Two illegal acts",
183 |       "D": " An illegal act"
184 |     }
185 |   },
186 |   "910100": {
187 |     "vid_path": "Eval_video/Driving-exam/910100.mp4",
188 |     "video_id": "910100",
189 |     "question": "19/36, how many illegal acts are there in video 4?",
190 |     "choices": {
191 |       "A": " An illegal act",
192 |       "B": " Two illegal acts",
193 |       "C": " Three illegal acts",
194 |       "D": " Four illegal acts"
195 |     }
196 |   },
197 |   "910200": {
198 |     "vid_path": "Eval_video/Driving-exam/910200.mp4",
199 |     "video_id": "910200",
200 |     "question": "20/36, how many illegal acts are there in video 5?",
201 |     "choices": {
202 |       "A": " Two illegal acts",
203 |       "B": " An illegal act",
204 |       "C": " Four illegal acts",
205 |       "D": " Three illegal acts"
206 |     }
207 |   },
208 |   "910300": {
209 |     "vid_path": "Eval_video/Driving-exam/910300.mp4",
210 |     "video_id": "910300",
211 |     "question": "21/36, how many illegal acts are there in video 6?",
212 |     "choices": {
213 |       "A": " Three illegal acts",
214 |       "B": " Four illegal acts",
215 |       "C": " Two illegal acts",
216 |       "D": " An illegal act"
217 |     }
218 |   },
219 |   "910400": {
220 |     "vid_path": "Eval_video/Driving-exam/910400.mp4",
221 |     "video_id": "910400",
222 |     "question": "22/36, how many illegal acts are there in video 7?",
223 |     "choices": {
224 |       "A": " Four illegal acts",
225 |       "B": " Two illegal acts",
226 |       "C": " An illegal act",
227 |       "D": " Three illegal acts"
228 |     }
229 |   },
230 |   "910500": {
231 |     "vid_path": "Eval_video/Driving-exam/910500.mp4",
232 |     "video_id": "910500",
233 |     "question": "23/36, how many illegal acts are there in video 8?",
234 |     "choices": {
235 |       "A": " Four illegal acts",
236 |       "B": " Three illegal acts",
237 |       "C": " Two illegal acts",
238 |       "D": " An illegal act"
239 |     }
240 |   },
241 |   "13497300": {
242 |     "vid_path": "Eval_video/Driving-exam/13497300.mp4",
243 |     "video_id": "13497300",
244 |     "question": "24/36, as shown in the video, which lane in the picture is the correct way to pass the crosswalk?",
245 |     "choices": {
246 |       "A": "\u2461\u2462",
247 |       "B": "\u2460\u2461\u2462",
248 |       "C": "\u2460\u2461",
249 |       "D": "\u2460\u2462"
250 |     }
251 |   },
252 |   "13497700": {
253 |     "vid_path": "Eval_video/Driving-exam/13497700.mp4",
254 |     "video_id": "13497700",
255 |     "question": "25/36. As shown in the video, is the driver's behavior correct?",
256 |     "choices": {
257 |       "A": " correct",
258 |       "B": " error"
259 |     }
260 |   },
261 |   "13497900": {
262 |     "vid_path": "Eval_video/Driving-exam/13497900.mp4",
263 |     "video_id": "13497900",
264 |     "question": "26/36. As shown in the video, is it correct for motor vehicles to cross the pedestrian crossing?",
265 |     "choices": {
266 |       "A": " correct",
267 |       "B": " error"
268 |     }
269 |   },
270 |   "13498200": {
271 |     "vid_path": "Eval_video/Driving-exam/13498200.mp4",
272 |     "video_id": "13498200",
273 |     "question": "27/36. As shown in the video, it is wrong for motor vehicles to pass through the crosswalk. Is this statement correct?",
274 |     "choices": {
275 |       "A": " correct",
276 |       "B": " error"
277 |     }
278 |   },
279 |   "13498300": {
280 |     "vid_path": "Eval_video/Driving-exam/13498300.mp4",
281 |     "video_id": "13498300",
282 |     "question": "28/36. As shown in the video, how should motor vehicles drive through this intersection?",
283 |     "choices": {
284 |       "A": " Pass quickly while honking the horn",
285 |       "B": " slow down",
286 |       "C": " accelerate pass",
287 |       "D": " stop and give way"
288 |     }
289 |   },
290 |   "13498400": {
291 |     "vid_path": "Eval_video/Driving-exam/13498400.mp4",
292 |     "video_id": "13498400",
293 |     "question": "29/36. As shown in the video, is it correct for the motor vehicle to turn left at the intersection? Is this statement correct?",
294 |     "choices": {
295 |       "A": " correct",
296 |       "B": " error"
297 |     }
298 |   },
299 |   "13498600": {
300 |     "vid_path": "Eval_video/Driving-exam/13498600.mp4",
301 |     "video_id": "13498600",
302 |     "question": "30/36. As shown in the video, it is wrong for motor vehicles to pass through pedestrian crossings. Is this statement correct?",
303 |     "choices": {
304 |       "A": " correct",
305 |       "B": " error"
306 |     }
307 |   },
308 |   "13498700": {
309 |     "vid_path": "Eval_video/Driving-exam/13498700.mp4",
310 |     "video_id": "13498700",
311 |     "question": "31/36. As shown in the video, is the driving behavior of the motor vehicle correct?",
312 |     "choices": {
313 |       "A": " correct",
314 |       "B": " error"
315 |     }
316 |   },
317 |   "13498800": {
318 |     "vid_path": "Eval_video/Driving-exam/13498800.mp4",
319 |     "video_id": "13498800",
320 |     "question": "32/36. As shown in the video, is the driving behavior of AB and AB correct?",
321 |     "choices": {
322 |       "A": " A is wrong, B is correct",
323 |       "B": " A is correct, B is wrong",
324 |       "C": "C and AB are both wrong",
325 |       "D": "D and AB are both correct"
326 |     }
327 |   },
328 |   "13499000": {
329 |     "vid_path": "Eval_video/Driving-exam/13499000.mp4",
330 |     "video_id": "13499000",
331 |     "question": "33/36. As shown in the video, motor vehicles did not affect normal pedestrian traffic when passing the crosswalk, so it is the correct approach. Is this statement correct?",
332 |     "choices": {
333 |       "A": " correct",
334 |       "B": " error"
335 |     }
336 |   },
337 |   "13499200": {
338 |     "vid_path": "Eval_video/Driving-exam/13499200.mp4",
339 |     "video_id": "13499200",
340 |     "question": "34/36. When a motor vehicle is passing a crosswalk, the driver may stop and wave to signal for pedestrians to pass first. Is this statement correct?",
341 |     "choices": {
342 |       "A": " correct",
343 |       "B": " error"
344 |     }
345 |   },
346 |   "13499400": {
347 |     "vid_path": "Eval_video/Driving-exam/13499400.mp4",
348 |     "video_id": "13499400",
349 |     "question": "35/36. As shown in the video, the behavior of pedestrians crossing the zebra crossing in the video is uncivilized.",
350 |     "choices": {
351 |       "A": " correct",
352 |       "B": " error"
353 |     }
354 |   },
355 |   "13499600": {
356 |     "vid_path": "Eval_video/Driving-exam/13499600.mp4",
357 |     "video_id": "13499600",
358 |     "question": "36/36. As shown in the video, is the behavior of pedestrians and motor vehicles correct?",
359 |     "choices": {
360 |       "A": " correct",
361 |       "B": " error"
362 |     }
363 |   }
364 | }


--------------------------------------------------------------------------------
/Eval_QA/MOT_QA.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "0": {
  3 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT16_videos/MOT16-02.mp4",
  4 |     "video_id": "MOT16-02",
  5 |     "question": "How many distinct pedestrians are visible in the video?",
  6 |     "choices": {
  7 |       "A": "81",
  8 |       "B": "37",
  9 |       "C": "18",
 10 |       "D": "100",
 11 |       "E": "126",
 12 |       "F": "63"
 13 |     },
 14 |     "answer": "F"
 15 |   },
 16 |   "1": {
 17 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT16_videos/MOT16-04.mp4",
 18 |     "video_id": "MOT16-04",
 19 |     "question": "In the video, how many pedestrians can be recognized as different individuals?",
 20 |     "choices": {
 21 |       "A": "114",
 22 |       "B": "176",
 23 |       "C": "52",
 24 |       "D": "26",
 25 |       "E": "88",
 26 |       "F": "140"
 27 |     },
 28 |     "answer": "E"
 29 |   },
 30 |   "2": {
 31 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT16_videos/MOT16-05.mp4",
 32 |     "video_id": "MOT16-05",
 33 |     "question": "In the video, how many distinct individuals can be seen walking?",
 34 |     "choices": {
 35 |       "A": "200",
 36 |       "B": "162",
 37 |       "C": "37",
 38 |       "D": "125",
 39 |       "E": "250",
 40 |       "F": "75"
 41 |     },
 42 |     "answer": "D"
 43 |   },
 44 |   "3": {
 45 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT16_videos/MOT16-09.mp4",
 46 |     "video_id": "MOT16-09",
 47 |     "question": "In the video, how many pedestrians can be identified as separate individuals?",
 48 |     "choices": {
 49 |       "A": "15",
 50 |       "B": "32",
 51 |       "C": "25",
 52 |       "D": "7",
 53 |       "E": "50",
 54 |       "F": "40"
 55 |     },
 56 |     "answer": "C"
 57 |   },
 58 |   "4": {
 59 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT16_videos/MOT16-10.mp4",
 60 |     "video_id": "MOT16-10",
 61 |     "question": "What is the number of different individuals seen walking in the video?",
 62 |     "choices": {
 63 |       "A": "17",
 64 |       "B": "57",
 65 |       "C": "34",
 66 |       "D": "74",
 67 |       "E": "114",
 68 |       "F": "91"
 69 |     },
 70 |     "answer": "B"
 71 |   },
 72 |   "5": {
 73 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT16_videos/MOT16-11.mp4",
 74 |     "video_id": "MOT16-11",
 75 |     "question": "How many different people can be spotted as pedestrians in the video?",
 76 |     "choices": {
 77 |       "A": "41",
 78 |       "B": "89",
 79 |       "C": "20",
 80 |       "D": "138",
 81 |       "E": "110",
 82 |       "F": "69"
 83 |     },
 84 |     "answer": "F"
 85 |   },
 86 |   "6": {
 87 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT16_videos/MOT16-13.mp4",
 88 |     "video_id": "MOT16-13",
 89 |     "question": "What is the number of different individuals seen walking in the video?",
 90 |     "choices": {
 91 |       "A": "139",
 92 |       "B": "64",
 93 |       "C": "214",
 94 |       "D": "171",
 95 |       "E": "32",
 96 |       "F": "107"
 97 |     },
 98 |     "answer": "F"
 99 |   },
100 |   "7": {
101 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT17_videos/MOT17-02-FRCNN.mp4",
102 |     "video_id": "MOT17-02-FRCNN",
103 |     "question": "How many distinct pedestrians are visible in the video?",
104 |     "choices": {
105 |       "A": "21",
106 |       "B": "42",
107 |       "C": "113",
108 |       "D": "142",
109 |       "E": "71",
110 |       "F": "92"
111 |     },
112 |     "answer": "E"
113 |   },
114 |   "8": {
115 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT17_videos/MOT17-04-FRCNN.mp4",
116 |     "video_id": "MOT17-04-FRCNN",
117 |     "question": "How many distinct pedestrians are visible in the video?",
118 |     "choices": {
119 |       "A": "176",
120 |       "B": "52",
121 |       "C": "26",
122 |       "D": "140",
123 |       "E": "88",
124 |       "F": "114"
125 |     },
126 |     "answer": "E"
127 |   },
128 |   "9": {
129 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT17_videos/MOT17-05-FRCNN.mp4",
130 |     "video_id": "MOT17-05-FRCNN",
131 |     "question": "What is the total count of different pedestrians in the video?",
132 |     "choices": {
133 |       "A": "172",
134 |       "B": "133",
135 |       "C": "39",
136 |       "D": "266",
137 |       "E": "79",
138 |       "F": "212"
139 |     },
140 |     "answer": "B"
141 |   },
142 |   "10": {
143 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT17_videos/MOT17-09-FRCNN.mp4",
144 |     "video_id": "MOT17-09-FRCNN",
145 |     "question": "What is the number of different individuals seen walking in the video?",
146 |     "choices": {
147 |       "A": "35",
148 |       "B": "54",
149 |       "C": "43",
150 |       "D": "16",
151 |       "E": "27",
152 |       "F": "8"
153 |     },
154 |     "answer": "E"
155 |   },
156 |   "11": {
157 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT17_videos/MOT17-10-FRCNN.mp4",
158 |     "video_id": "MOT17-10-FRCNN",
159 |     "question": "How many different people can be spotted as pedestrians in the video?",
160 |     "choices": {
161 |       "A": "60",
162 |       "B": "120",
163 |       "C": "96",
164 |       "D": "36",
165 |       "E": "18",
166 |       "F": "78"
167 |     },
168 |     "answer": "A"
169 |   },
170 |   "12": {
171 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT17_videos/MOT17-11-FRCNN.mp4",
172 |     "video_id": "MOT17-11-FRCNN",
173 |     "question": "How many individuals walking can be distinguished in the video?",
174 |     "choices": {
175 |       "A": "76",
176 |       "B": "121",
177 |       "C": "98",
178 |       "D": "152",
179 |       "E": "22",
180 |       "F": "45"
181 |     },
182 |     "answer": "A"
183 |   },
184 |   "13": {
185 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT17_videos/MOT17-13-FRCNN.mp4",
186 |     "video_id": "MOT17-13-FRCNN",
187 |     "question": "How many distinct pedestrians are present in the video?",
188 |     "choices": {
189 |       "A": "176",
190 |       "B": "220",
191 |       "C": "110",
192 |       "D": "143",
193 |       "E": "66",
194 |       "F": "33"
195 |     },
196 |     "answer": "C"
197 |   },
198 |   "14": {
199 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT20_videos/MOT20-01.mp4",
200 |     "video_id": "MOT20-01",
201 |     "question": "How many separate pedestrians are there in the video?",
202 |     "choices": {
203 |       "A": "80",
204 |       "B": "160",
205 |       "C": "128",
206 |       "D": "104",
207 |       "E": "24",
208 |       "F": "48"
209 |     },
210 |     "answer": "A"
211 |   },
212 |   "15": {
213 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT20_videos/MOT20-02.mp4",
214 |     "video_id": "MOT20-02",
215 |     "question": "What is the total number of distinct pedestrians in the video?",
216 |     "choices": {
217 |       "A": "274",
218 |       "B": "82",
219 |       "C": "164",
220 |       "D": "438",
221 |       "E": "548",
222 |       "F": "356"
223 |     },
224 |     "answer": "A"
225 |   },
226 |   "16": {
227 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT20_videos/MOT20-03.mp4",
228 |     "video_id": "MOT20-03",
229 |     "question": "How many different people can be spotted as pedestrians in the video?",
230 |     "choices": {
231 |       "A": "434",
232 |       "B": "1448",
233 |       "C": "217",
234 |       "D": "1158",
235 |       "E": "941",
236 |       "F": "724"
237 |     },
238 |     "answer": "F"
239 |   },
240 |   "17": {
241 |     "vid_path": "/remote-home/share/VideoBenchmark/videos/MOT_video/MOT20_videos/MOT20-05.mp4",
242 |     "video_id": "MOT20-05",
243 |     "question": "What is the quantity of different pedestrians present in the video?",
244 |     "choices": {
245 |       "A": "1560",
246 |       "B": "1200",
247 |       "C": "1920",
248 |       "D": "360",
249 |       "E": "720",
250 |       "F": "2400"
251 |     },
252 |     "answer": "B"
253 |   }
254 | }


--------------------------------------------------------------------------------
/Eval_QA/MOT_QA_new.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "0": {
  3 |     "vid_path": "Eval_video/MOT/MOT16-02.mp4",
  4 |     "video_id": "MOT16-02",
  5 |     "question": "How many distinct pedestrians are visible in the video?",
  6 |     "choices": {
  7 |       "A": "81",
  8 |       "B": "37",
  9 |       "C": "18",
 10 |       "D": "100",
 11 |       "E": "126",
 12 |       "F": "63"
 13 |     }
 14 |   },
 15 |   "1": {
 16 |     "vid_path": "Eval_video/MOT/MOT16-04.mp4",
 17 |     "video_id": "MOT16-04",
 18 |     "question": "In the video, how many pedestrians can be recognized as different individuals?",
 19 |     "choices": {
 20 |       "A": "114",
 21 |       "B": "176",
 22 |       "C": "52",
 23 |       "D": "26",
 24 |       "E": "88",
 25 |       "F": "140"
 26 |     }
 27 |   },
 28 |   "2": {
 29 |     "vid_path": "Eval_video/MOT/MOT16-05.mp4",
 30 |     "video_id": "MOT16-05",
 31 |     "question": "In the video, how many distinct individuals can be seen walking?",
 32 |     "choices": {
 33 |       "A": "200",
 34 |       "B": "162",
 35 |       "C": "37",
 36 |       "D": "125",
 37 |       "E": "250",
 38 |       "F": "75"
 39 |     }
 40 |   },
 41 |   "3": {
 42 |     "vid_path": "Eval_video/MOT/MOT16-09.mp4",
 43 |     "video_id": "MOT16-09",
 44 |     "question": "In the video, how many pedestrians can be identified as separate individuals?",
 45 |     "choices": {
 46 |       "A": "15",
 47 |       "B": "32",
 48 |       "C": "25",
 49 |       "D": "7",
 50 |       "E": "50",
 51 |       "F": "40"
 52 |     }
 53 |   },
 54 |   "4": {
 55 |     "vid_path": "Eval_video/MOT/MOT16-10.mp4",
 56 |     "video_id": "MOT16-10",
 57 |     "question": "What is the number of different individuals seen walking in the video?",
 58 |     "choices": {
 59 |       "A": "17",
 60 |       "B": "57",
 61 |       "C": "34",
 62 |       "D": "74",
 63 |       "E": "114",
 64 |       "F": "91"
 65 |     }
 66 |   },
 67 |   "5": {
 68 |     "vid_path": "Eval_video/MOT/MOT16-11.mp4",
 69 |     "video_id": "MOT16-11",
 70 |     "question": "How many different people can be spotted as pedestrians in the video?",
 71 |     "choices": {
 72 |       "A": "41",
 73 |       "B": "89",
 74 |       "C": "20",
 75 |       "D": "138",
 76 |       "E": "110",
 77 |       "F": "69"
 78 |     }
 79 |   },
 80 |   "6": {
 81 |     "vid_path": "Eval_video/MOT/MOT16-13.mp4",
 82 |     "video_id": "MOT16-13",
 83 |     "question": "What is the number of different individuals seen walking in the video?",
 84 |     "choices": {
 85 |       "A": "139",
 86 |       "B": "64",
 87 |       "C": "214",
 88 |       "D": "171",
 89 |       "E": "32",
 90 |       "F": "107"
 91 |     }
 92 |   },
 93 |   "7": {
 94 |     "vid_path": "Eval_video/MOT/MOT17-02-FRCNN.mp4",
 95 |     "video_id": "MOT17-02-FRCNN",
 96 |     "question": "How many distinct pedestrians are visible in the video?",
 97 |     "choices": {
 98 |       "A": "21",
 99 |       "B": "42",
100 |       "C": "113",
101 |       "D": "142",
102 |       "E": "71",
103 |       "F": "92"
104 |     }
105 |   },
106 |   "8": {
107 |     "vid_path": "Eval_video/MOT/MOT17-04-FRCNN.mp4",
108 |     "video_id": "MOT17-04-FRCNN",
109 |     "question": "How many distinct pedestrians are visible in the video?",
110 |     "choices": {
111 |       "A": "176",
112 |       "B": "52",
113 |       "C": "26",
114 |       "D": "140",
115 |       "E": "88",
116 |       "F": "114"
117 |     }
118 |   },
119 |   "9": {
120 |     "vid_path": "Eval_video/MOT/MOT17-05-FRCNN.mp4",
121 |     "video_id": "MOT17-05-FRCNN",
122 |     "question": "What is the total count of different pedestrians in the video?",
123 |     "choices": {
124 |       "A": "172",
125 |       "B": "133",
126 |       "C": "39",
127 |       "D": "266",
128 |       "E": "79",
129 |       "F": "212"
130 |     }
131 |   },
132 |   "10": {
133 |     "vid_path": "Eval_video/MOT/MOT17-09-FRCNN.mp4",
134 |     "video_id": "MOT17-09-FRCNN",
135 |     "question": "What is the number of different individuals seen walking in the video?",
136 |     "choices": {
137 |       "A": "35",
138 |       "B": "54",
139 |       "C": "43",
140 |       "D": "16",
141 |       "E": "27",
142 |       "F": "8"
143 |     }
144 |   },
145 |   "11": {
146 |     "vid_path": "Eval_video/MOT/MOT17-10-FRCNN.mp4",
147 |     "video_id": "MOT17-10-FRCNN",
148 |     "question": "How many different people can be spotted as pedestrians in the video?",
149 |     "choices": {
150 |       "A": "60",
151 |       "B": "120",
152 |       "C": "96",
153 |       "D": "36",
154 |       "E": "18",
155 |       "F": "78"
156 |     }
157 |   },
158 |   "12": {
159 |     "vid_path": "Eval_video/MOT/MOT17-11-FRCNN.mp4",
160 |     "video_id": "MOT17-11-FRCNN",
161 |     "question": "How many individuals walking can be distinguished in the video?",
162 |     "choices": {
163 |       "A": "76",
164 |       "B": "121",
165 |       "C": "98",
166 |       "D": "152",
167 |       "E": "22",
168 |       "F": "45"
169 |     }
170 |   },
171 |   "13": {
172 |     "vid_path": "Eval_video/MOT/MOT17-13-FRCNN.mp4",
173 |     "video_id": "MOT17-13-FRCNN",
174 |     "question": "How many distinct pedestrians are present in the video?",
175 |     "choices": {
176 |       "A": "176",
177 |       "B": "220",
178 |       "C": "110",
179 |       "D": "143",
180 |       "E": "66",
181 |       "F": "33"
182 |     }
183 |   },
184 |   "14": {
185 |     "vid_path": "Eval_video/MOT/MOT20-01.mp4",
186 |     "video_id": "MOT20-01",
187 |     "question": "How many separate pedestrians are there in the video?",
188 |     "choices": {
189 |       "A": "80",
190 |       "B": "160",
191 |       "C": "128",
192 |       "D": "104",
193 |       "E": "24",
194 |       "F": "48"
195 |     }
196 |   },
197 |   "15": {
198 |     "vid_path": "Eval_video/MOT/MOT20-02.mp4",
199 |     "video_id": "MOT20-02",
200 |     "question": "What is the total number of distinct pedestrians in the video?",
201 |     "choices": {
202 |       "A": "274",
203 |       "B": "82",
204 |       "C": "164",
205 |       "D": "438",
206 |       "E": "548",
207 |       "F": "356"
208 |     }
209 |   },
210 |   "16": {
211 |     "vid_path": "Eval_video/MOT/MOT20-03.mp4",
212 |     "video_id": "MOT20-03",
213 |     "question": "How many different people can be spotted as pedestrians in the video?",
214 |     "choices": {
215 |       "A": "434",
216 |       "B": "1448",
217 |       "C": "217",
218 |       "D": "1158",
219 |       "E": "941",
220 |       "F": "724"
221 |     }
222 |   },
223 |   "17": {
224 |     "vid_path": "Eval_video/MOT/MOT20-05.mp4",
225 |     "video_id": "MOT20-05",
226 |     "question": "What is the quantity of different pedestrians present in the video?",
227 |     "choices": {
228 |       "A": "1560",
229 |       "B": "1200",
230 |       "C": "1920",
231 |       "D": "360",
232 |       "E": "720",
233 |       "F": "2400"
234 |     }
235 |   }
236 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | <p align="center">
  4 |     <img src="assets/logo.png" width="300" style="margin-bottom: 0.2;"/>
  5 | <p> 
  6 | <h2 align="center"> <a href="https://arxiv.org/abs/2311.16103">Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models</a></h2>
  7 | <h5 align="center"> If you like our project, please give us a star ⭐ on GitHub for latest update.  </h2>
  8 | 
  9 |  
 10 | <h5 align="center">
 11 |     
 12 | [![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Video-Bench)
 13 | [![arXiv](https://img.shields.io/badge/Arxiv-2311.10122-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2311.16103)
 14 | [![License](https://img.shields.io/badge/License-Apache%202.0-yellow)](https://github.com/PKU-YuanGroup/Video-Bench/blob/main/LICENSE) 
 15 | [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FPKU-YuanGroup%2FVideo-Bench&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=Visitor&edge_flat=false)](https://hits.seeyoufarm.com)
 16 | [![GitHub issues](https://img.shields.io/github/issues/PKU-YuanGroup/Video-Bench?color=critical&label=Issues)](https://github.com/PKU-YuanGroup/Video-Bench/issues?q=is%3Aopen+is%3Aissue)
 17 | [![GitHub closed issues](https://img.shields.io/github/issues-closed/PKU-YuanGroup/Video-Bench?color=success&label=Issues)](https://github.com/PKU-YuanGroup/Video-Bench/issues?q=is%3Aissue+is%3Aclosed)  <br>
 18 | <!--[![zhihu](https://img.shields.io/badge/-Bilibili-000000?logo=bilibili&logoColor=00A1D6)](https://zhuanlan.zhihu.com/p/668166885)-->
 19 | 
 20 | </h5>
 21 |  
 22 | 
 23 | * **We introduce Video-Bench, the first comprehensive evaluation benchmark for Video-LLMs, featuring a three-level ability assessment that systematically evaluates models in video-exclusive understanding, prior knowledge incorporation, and video-based decision-making abilities.**
 24 | * **We provide a user-friendly evaluation toolkit. Accompanied by our datasets and QA pairs, the toolkit can streamline the performance assessment of Video-LLMs.**
 25 | * **We conduct extensive experiments to evaluate prominent Video-LLMs, summarizing their behaviors, analyzing main causes for observed limitations, and proposing future directions for improvement.**
 26 | 
 27 | 
 28 | 
 29 | <details close><summary>💡 I also have other video-language projects that may interest you ✨. </summary><p>
 30 | 
 31 | 
 32 | > [**LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment**](https://arxiv.org/abs/2310.01852) <br>
 33 | > Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, HongFa Wang, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, Wancai Zhang, Zhifeng Li, Wei Liu, Li Yuan <br>
 34 | [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/PKU-YuanGroup/LanguageBind) [![arXiv](https://img.shields.io/badge/Arxiv-2310.01852-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2310.01852) <br>
 35 | 
 36 | > [**Video-LLaVA: Learning United Visual Representation by Alignment Before Projection**](https://arxiv.org/abs/2311.10122) <br>
 37 | > Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan <br>
 38 | [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/PKU-YuanGroup/Video-LLaVA) [![arXiv](https://img.shields.io/badge/Arxiv-2311.16103-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2311.16103) <br>
 39 | 
 40 | > [**Chat-UniVi: Unified Visual Representation Empowers Large Language Models with Image and Video Understanding**](https://arxiv.org/abs/2311.08046) <br>
 41 | > Peng Jin, Ryuichi Takanobu, Caiwan Zhang, Xiaochun Cao, Li Yuan <br>
 42 | [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/PKU-YuanGroup/Chat-UniVi) [![arXiv](https://img.shields.io/badge/Arxiv-2311.08046-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2311.08046) <br>
 43 | </p></details>
 44 | 
 45 | 
 46 | 
 47 | ## 📰 News
 48 | **[2023.12.31]** We have updated the performance of the **Sphinx-V2** model on the [Video Bench LeaderBoard](https://huggingface.co/spaces/LanguageBind/Video-Bench), significantly surpassing other VLLMs! 
 49 | 
 50 | **[2023.11.27]** Video-Bench is released! Data and evaluation code is available now.
 51 | 
 52 | ## 📣 Leaderboard
 53 | Welcome to [**Video-Benchmark Leaderboard**](https://huggingface.co/spaces/LanguageBind/Video-Bench)!
 54 | 
 55 | 🚩🚩🚩 We are delighted to have witnessed the remarkable advancements in video understanding and artificial intelligence alongside the community over the past year. We are proud to announce the launch of Video-Bench, a platform designed to assist developers and users in the field of video analysis.
 56 | 
 57 | 🔥🔥🔥  Video-Bench is committed to promoting the progress of video understanding models and facilitating their evaluation. We are pleased to announce the inaugural Video-Bench Leaderboard. This leaderboard aims to systematically evaluate the performance of video understanding models across various capabilities, including **Prior Knowledge based QA, Comprehension Decision-making, Video-exclusive Understanding,** and more.
 58 | The leaderboard will feature rankings for open-source models, providing an inclusive and comprehensive reference for the industry and research community. We invite developers and researchers working on video understanding models to join Video-Bench and showcase their models' performance advantages in different domains.
 59 | 
 60 | 👋👋👋 We also welcome valuable suggestions and contributions from the community to foster collaborative growth and advancement in video understanding models. If you have any questions or would like to get involved, please feel free to contact us. Let's eagerly anticipate the release of the Video-Bench Leaderboard and the continued progress in video understanding and artificial intelligence!
 61 | 
 62 | ## 🤗 Evaluation
 63 | 
 64 | 1. Clone this repository and navigate to Video-Bench folder
 65 | ```bash
 66 | git clone https://github.com/PKU-YuanGroup/Video-Bench.git
 67 | cd Video-Bench
 68 | 
 69 | ```
 70 | 2. Install additional packages
 71 | ```bash
 72 | pip install -r requirements.txt
 73 | ```
 74 | ### 📂 Data Preparation
 75 | The video data can easily be downloaded from [Huggingface](https://huggingface.co/datasets/LanguageBind/Video-Bench) 
 76 | 
 77 | ### 🏗️ Evaluate your own model
 78 | The code below is just a generalized framework for dataset evaluation, you will need to refine the model loading part according to your own model. Once the code execution is complete, you will find some JSON files named `./Chat_results/{dataset_name}.json`. 
 79 | 
 80 | #### Step1: Chat with your own model to obtain conversation results.
 81 | ```python
 82 | import argparse
 83 | import os
 84 | import json
 85 | 
 86 | parser = argparse.ArgumentParser()
 87 | parser.add_argument("--dataset_name", type=str, default=None, help="The type of LLM")
 88 | parser.add_argument("--Eval_QA_root", type=str, default='./', help="folder containing QA JSON files")
 89 | parser.add_argument("--Eval_Video_root", type=str, default='./', help="folder containing video data")
 90 | parser.add_argument("--chat_conversation_output_folder", type=str, default='./Chat_results', help="")
 91 | args = parser.parse_args()
 92 | 
 93 | Eval_QA_root = args.Eval_QA_root
 94 | Eval_Video_root = args.Eval_Video_root
 95 | dataset_qajson = {
 96 |   "Ucfcrime": f"{Eval_QA_root}/Eval_QA/Ucfcrime_QA_new.json",
 97 |   "Youcook2": f"{Eval_QA_root}/Eval_QA/Youcook2_QA_new.json",
 98 |   "TVQA": f"{Eval_QA_root}/Eval_QA/TVQA_QA_new.json",
 99 |   "MSVD": f"{Eval_QA_root}/Eval_QA/MSVD_QA_new.json",
100 |   "MSRVTT": f"{Eval_QA_root}/Eval_QA/MSRVTT_QA_new.json",
101 |   "Driving-decision-making": f"{Eval_QA_root}/Eval_QA/Driving-decision-making_QA_new.json",
102 |   "NBA": f"{Eval_QA_root}/Eval_QA/NBA_QA_new.json",
103 |   "SQA3D": f"{Eval_QA_root}/Eval_QA/SQA3D_QA_new.json",
104 |   "Driving-exam": f"{Eval_QA_root}/Eval_QA/Driving-exam_QA_new.json",
105 |   "MV": f"{Eval_QA_root}/Eval_QA/MV_QA_new.json",
106 |   "MOT": f"{Eval_QA_root}/Eval_QA/MOT_QA_new.json",
107 |   "ActivityNet": f"{Eval_QA_root}/Eval_QA/ActivityNet_QA_new.json",
108 |   "TGIF": f"{Eval_QA_root}/Eval_QA/TGIF_QA_new.json"
109 | }
110 | 
111 | if args.dataset_name is None:
112 |     dataset_name_list = list(dataset_qajson.keys())
113 | else:
114 |     dataset_name_list = [args.dataset_name]
115 |     print(f'Specifically run {args.dataset_name}')
116 | print(dataset_name_list)
117 | 
118 | os.makedirs(args.chat_conversation_output_folder, exist_ok=True)
119 | 
120 | for dataset_name in dataset_name_list:
121 |     qa_json = dataset_qajson[dataset_name]
122 |     print(f'Dataset name:{dataset_name}, {qa_json=}!')
123 |     with open(qa_json, 'r', encoding='utf-8') as f:
124 |         data = json.load(f)
125 |         
126 |     eval_dict = {}
127 |     for idx, (q_id, item) in enumerate(data.items()):
128 |         try:   
129 |             video_id = item['video_id']
130 |             question = item['question'] 
131 |             if len(item['choices']) == 6:
132 |                 question += f"Choices: A.{item['choices']['A']} B.{item['choices']['B']} C.{item['choices']['C']} D.{item['choices']['D']} E.{item['choices']['E']} F.{item['choices']['F']} \n Among the six options A, B, C, D, E, F above, the one closest to the correct answer is:"
133 |                 candidates = ['A', 'B', 'C', 'D', 'E', 'F']
134 |                 candidates_long = [f" A.{item['choices']['A']}", f"B.{item['choices']['B']}", f"C.{item['choices']['C']}", f"D.{item['choices']['D']}", f"E.{item['choices']['E']}", f"F.{item['choices']['F']}"]
135 |             elif len(item['choices']) == 5:
136 |                 question += f" A.{item['choices']['A']} B.{item['choices']['B']} C.{item['choices']['C']} D.{item['choices']['D']} E.{item['choices']['E']} \n Among the five options A, B, C, D, E above, the one closest to the correct answer is: "
137 |                 candidates = ['A', 'B', 'C', 'D', 'E']
138 |                 candidates_long = [f" A.{item['choices']['A']}", f"B.{item['choices']['B']}", f"C.{item['choices']['C']}", f"D.{item['choices']['D']}", f"E.{item['choices']['E']}"]
139 |             elif len(item['choices']) == 4:
140 |                 question += f" A.{item['choices']['A']} B.{item['choices']['B']} C.{item['choices']['C']} D.{item['choices']['D']} \n Among the four options A, B, C, D above, the one closest to the correct answer is:"
141 |                 candidates = ['A', 'B', 'C', 'D']
142 |                 candidates_long = [f" A.{item['choices']['A']}", f"B.{item['choices']['B']}", f"C.{item['choices']['C']}", f"D.{item['choices']['D']}"]
143 |             elif len(item['choices']) == 3:
144 |                 question += f" A.{item['choices']['A']} B.{item['choices']['B']} C.{item['choices']['C']} \n Among the three options A, B, C above, the one closest to the correct answer is: "
145 |                 candidates = ['A', 'B', 'C']
146 |                 candidates_long = [f" A.{item['choices']['A']}", f"B.{item['choices']['B']}", f"C.{item['choices']['C']}"]
147 |             elif len(item['choices']) == 2:
148 |                 question += f" A.{item['choices']['A']} B.{item['choices']['B']} \n Among the two options A, B above, the one closest to the correct answer is: "
149 |                 candidates = ['A', 'B']
150 |                 candidates_long = [f" A.{item['choices']['A']}", f"B.{item['choices']['B']}"]
151 |             vid_rela_path = item['vid_path']
152 |             vid_path = os.path.join(Eval_Video_root, vid_rela_path)
153 | 
154 | 
155 |             #=================================You need to change this code =========================
156 |             # ......
157 |             output, output_scores = ask(args, question, model, tokenizer, image_processor, vid_path)
158 |             # ......
159 |             #=======================================================================================
160 | 
161 |             eval_dict[q_id] = {
162 |                 'video_id': video_id,
163 |                 'question': question,
164 |                 'output_sequence': output
165 |             }  
166 |             print(f'q_id:{q_id}, output:{output}!\n')
167 |         except Exception as e:
168 |             traceback.print_exc()  
169 |     # eval results
170 |     eval_dataset_json = f'{args.chat_conversation_output_folder}/{dataset_name}_eval.json'
171 |     with open(eval_dataset_json, 'w', encoding='utf-8') as f:
172 |         json.dump(eval_dict, f, indent=2)
173 | 
174 | ```
175 | 
176 | After obtaining the `./Chat_results/{dataset_name}.json` files, you can utilize ChatGPT or T5 model as experts to assess the correctness of the model's output answer. The specific code is as follows:
177 | 
178 | #### Step2: Evaluate your model's answer and obtain final scores across 13 datasets.
179 | 
180 | **ChatGPT Evaluation** 
181 | Note that since chatgpt may answer some formatting errors, you need to run below `Step2_chatgpt_judge.py` multiple times to ensure that each question is validated by chatgpt!
182 | ```python 
183 | python Step2_chatgpt_judge.py  --model_chat_files_folder ./Chat_results  \
184 | --apikey sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx \  # --apikey need to specify your openai apikey account
185 | --chatgpt_judge_output_folder  ./ChatGPT_Judge
186 | ```
187 | 
188 | ```python
189 | python Step3_merge_into_one_json.py  --chatgpt_judge_files_folder ./ChatGPT_Judge \
190 | --merge_file  ./Video_Bench_Input.json
191 | ```
192 | <!--
193 | 
194 | **T5 Evaluation**
195 | ```python
196 | python Step2_T5_judge.py  --model_chat_files_folder  ./Chat_results \
197 | --T5_judge_output_folder  ./T5_Judge  \
198 | --Eval_QA_root  ./ 
199 | ```
200 | 
201 | ```python
202 | python Step3_compute_scores_T5.py  --T5_judge_files_folder  ./T5_Judge \
203 | --score_output_file  ./Final_score_table_T5.csv
204 | ```
205 | 
206 | -->
207 | 
208 | After you get the `Video_Bench_Input.json` file, you can submit this file to [Video-Bench leaderboard](https://huggingface.co/spaces/LanguageBind/Video-Bench) to compare with other models!
209 | 
210 | ## 🐳  License
211 | Video-Bench is released under Apache License Version 2.0.
212 | 
213 | ## 🤝 Contributors
214 | 
215 | <a href="https://github.com/PKU-YuanGroup/Video-Bench/graphs/contributors">
216 |   <img src="https://contrib.rocks/image?repo=PKU-YuanGroup/Video-Bench" />
217 | </a>
218 | 


--------------------------------------------------------------------------------
/Step2_T5_judge.py:
--------------------------------------------------------------------------------
  1 | from sentence_transformers import SentenceTransformer, util
  2 | import numpy as np
  3 | import os, json, glob
  4 | import copy
  5 | import pprint
  6 | import argparse
  7 | 
  8 | def T5_similarity(output_sequence=None, chocies_list = None):
  9 |     sentences = [output_sequence]
 10 |     sentences2 = chocies_list
 11 |     model = SentenceTransformer('sentence-transformers/sentence-t5-large', cache_folder='/remote-home/share/VideoBenchmark/Video_Benchmark/T5_evaluation')
 12 |     model = model.cuda()
 13 |     embeddings = model.encode(sentences)
 14 |     embeddings2 = model.encode(sentences2)
 15 |     #Compute cosine-similarities
 16 |     cosine_scores = util.cos_sim(embeddings, embeddings2)
 17 |     index = np.argmax(cosine_scores)
 18 |     return index
 19 | 
 20 | import traceback
 21 | def json_T5_eval(T5_save_folder=None, jsonfile=None, args=None):
 22 |     dataset_qajson = {
 23 |                     "Ucfcrime": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/Ucfcrime_QA_new.json"),
 24 |                     "Youcook2": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/Youcook2_QA_new.json"),
 25 |                     "TVQA": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/TVQA_QA_new.json"),
 26 |                     "MSVD": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/MSVD_QA_new.json"),
 27 |                     "MSRVTT": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/MSRVTT_QA_new.json"),
 28 |                     "Driving-decision-making": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/Driving-decision-making_QA_new.json"),
 29 |                     "NBA": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/NBA_QA_new.json"),
 30 |                     "SQA3D": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/SQA3D_QA_new.json"),
 31 |                     "Driving-exam": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/Driving-exam_QA_new.json"),
 32 |                     "MV": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/MV_QA_new.json"),
 33 |                     "MOT": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/MOT_QA_new.json"),
 34 |                     "ActivityNet": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/ActivityNet_QA_new.json"),
 35 |                     "TGIF": os.path.join( f"{args.Eval_QA_root}", "Eval_QA/TGIF_QA_new.json")
 36 |         }
 37 |     # dataset 的question-choices-answer jsonfile
 38 |     dataset_name = os.path.basename(jsonfile).split('_eval.json')[0]
 39 |     print(f'Dataset name: {dataset_name}')
 40 |     qa_choice_json = dataset_qajson[dataset_name]
 41 |     with open(qa_choice_json, 'r', encoding='utf-8') as f:
 42 |         qa_choice_data = json.load(f)
 43 | 
 44 |     # model chat jsonfile
 45 |     with open(jsonfile, 'r', encoding='utf-8') as f:
 46 |         data = json.load(f)
 47 |     
 48 |     candidates = ['A', 'B', 'C', 'D', 'E', 'F']
 49 |     try:
 50 |         new_data = {}
 51 |         for qid_vid, item in data.items():
 52 |             # 单独qa_t5 eval结果保存
 53 |             os.makedirs(os.path.join(T5_save_folder, os.path.basename(jsonfile).split('.')[0]), exist_ok=True)
 54 |             T5_qidvid_jsonfile = os.path.join(T5_save_folder, os.path.basename(jsonfile).split('.')[0], qid_vid+'.json')
 55 |             if not os.path.exists(T5_qidvid_jsonfile):
 56 |                 new_item = copy.deepcopy(item)
 57 |                 output_sequence = item['output_sequence']
 58 |                 video_id = item['video_id']
 59 |                 qid = qid_vid.replace(f'_{video_id}', '')
 60 |                 choices = qa_choice_data[qid]['choices']
 61 |                 choices = [ f'{alpha}. {choice}' for alpha, choice in choices.items()]
 62 | 
 63 |                 answer_index = T5_similarity(str(output_sequence), choices)
 64 |                 T5_answer = candidates[answer_index]
 65 |                 new_item['t5-answer']= T5_answer
 66 |                 new_item['choices'] = choices
 67 |                 pprint.pprint(new_item)
 68 |                 new_data[qid_vid] = new_item
 69 | 
 70 |                 with open(T5_qidvid_jsonfile, 'w', encoding='utf-8') as f:
 71 |                     json.dump({qid_vid:new_item}, f, indent=2)
 72 |                 print(T5_qidvid_jsonfile, 'is saved!')
 73 |             else:
 74 |                 print(f'{T5_qidvid_jsonfile} is existing!')
 75 | 
 76 |         #一个model的一个dataset 所有qa保存
 77 |         T5_dataset_jsonfile = os.path.join(T5_save_folder, os.path.basename(jsonfile))
 78 |         with open(T5_dataset_jsonfile, 'w', encoding='utf-8') as f:
 79 |             json.dump(new_data, f, indent=2)
 80 |     except Exception as e:
 81 |         print(traceback.print_exc())
 82 |         import ipdb
 83 |         ipdb.set_trace()
 84 |  
 85 | def main(args):
 86 |     evaljson_list = glob.glob(f'{args.model_chat_files_folder}/*_eval.json', recursive=True)
 87 |     print(f'{len(evaljson_list)}') #{evaljson_list},
 88 |     for evaljson in evaljson_list:
 89 |         try:
 90 |             json_T5_eval(args.T5_judge_output_folder, evaljson, args)
 91 |         except Exception as e:
 92 |             print(e)
 93 | 
 94 | if __name__ == "__main__":
 95 |     
 96 |     parser = argparse.ArgumentParser()
 97 |     parser.add_argument("--model_chat_files_folder", type=str, default="./Chat_results")
 98 |     parser.add_argument("--T5_judge_output_folder", type=str, default="./T5_Judge")
 99 |     parser.add_argument("--Eval_QA_root", type=str, default="/remote-home/share/VideoBenchmark/Video_Benchmark")
100 |     args = parser.parse_args()
101 |     main(args)
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/Step2_chatgpt_judge.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import csv
 3 | import glob
 4 | import os
 5 | import json
 6 | import random
 7 | from concurrent.futures import ThreadPoolExecutor
 8 | import openai
 9 | from retry import retry
10 | from tqdm import tqdm
11 | import time
12 | import argparse
13 | 
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument("--model_chat_files_folder", type=str, default="./Eval_results")
16 | parser.add_argument("--apikey", type=str, default="sk-xxxxxxxxxxxxxxxxx")
17 | parser.add_argument("--chatgpt_judge_output_folder", type=str, default="./ChatGPT_Judge")
18 | args = parser.parse_args()
19 | 
20 | 
21 | def chat_classify(gpt_input, model: str = "gpt-3.5-turbo-0613"):
22 |     @retry(tries=3, delay=10)
23 |     def request_openai_api():
24 |         #===============
25 |         messages = [
26 |             {"role": "system",
27 |              "content": 'As a language expert, please complete the following task.'},
28 |             {"role": "assistant",
29 |              "content": "You are now an answer selection expert, and I will provide you with a question with several options, "
30 |              "as well as a target sentence. Please return the alphabet of the option with the highest probability of matching "
31 |              "this target sentence. Given question with options and the target sequence:\n" + str(gpt_input)},
32 |             {"role": "user",
33 |              "content": 'Please output your responses in the form of a dictionary {"maximum probability":"xxx"}  '
34 |              'where xxx is A or B or C or ...'
35 |             }
36 |         ]   
37 |         response = openai.ChatCompletion.create(
38 |             model=model,
39 |             messages=messages,
40 |         )
41 |         return response
42 |     return request_openai_api()
43 | 
44 | def process_file( eval_file):
45 |     # time.sleep(5)
46 |     openai.api_key = args.apikey
47 | 
48 |     with open(eval_file, 'r', encoding='utf-8') as f:
49 |         eval_data = json.load(f)
50 |     try:
51 |         retry_delay = 2  # 重试延迟（秒）
52 |         for qid_vid, item in eval_data.items():
53 |             gpt_input = {'target sequcene': item['output_sequence'], 
54 |                         'question': item['question'],
55 |                         }
56 |             eval_item_copy = item.copy()
57 |             try:
58 |                 output_folder = os.path.join(args.chatgpt_judge_output_folder, os.path.basename(eval_file).replace('_eval.json', '_chatgpt_eval'))
59 |                 os.makedirs(output_folder, exist_ok=True)
60 |                 output_file =  os.path.join(output_folder, f'{qid_vid}.json')
61 |                 if os.path.exists(output_file):
62 |                     pass
63 |                     # print(f'{output_file} is existing!')
64 |                 else:
65 |                     res = chat_classify(gpt_input)
66 |                     content = res["choices"][0]["message"]["content"]
67 |                     output_chatgpt_choice = json.loads(content)["maximum probability"]
68 |                     if output_chatgpt_choice not in ['A','B','C','D','E','F']:
69 |                         raise KeyError 
70 |                     eval_item_copy['output_chatgpt_choice'] = output_chatgpt_choice
71 |                     save_to_file({qid_vid:eval_item_copy}, output_file)
72 |                 
73 |             except Exception as e:
74 |                 print(f'{e}, {eval_file}, {qid_vid}')
75 |                 time.sleep(retry_delay)
76 |         print(f'{eval_file} is finished!!!!!!!')
77 |     except Exception as e:
78 |         print(f'{eval_file} is error!!!!!!!!')
79 | 
80 | def save_to_file(data, output_file):
81 |     with open(output_file, 'w', encoding='utf-8') as f:
82 |         json.dump(data, f, indent=2)
83 |     print(f'{output_file} is saved!')
84 | 
85 | if __name__ == '__main__':
86 |     
87 |     os.makedirs(args.chatgpt_judge_output_folder, exist_ok=True)
88 |     evaljson_list = glob.glob(f'{args.model_chat_files_folder}/*_eval.json')
89 |     print(evaljson_list)
90 | 
91 |     try:
92 |         with ThreadPoolExecutor(64) as executor:
93 |             results = list(
94 |                 tqdm(executor.map(process_file, evaljson_list), total=len(evaljson_list), desc="Processing and saving files"))
95 |     except Exception as e:
96 |         print(e)
97 |         
98 |     
99 | 


--------------------------------------------------------------------------------
/Step3_merge_into_one_json.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import glob
 4 | import argparse
 5 | 
 6 | def merge_json(args):
 7 |   sub_folder_name_list = os.listdir(args.chatgpt_judge_files_folder)
 8 |   datastet_name_list = [ sub.replace('_chatgpt_eval', '')  for sub in sub_folder_name_list]
 9 |   datasets_dict = {}
10 |   for i, datastet_name in enumerate(datastet_name_list):
11 |     single_dataset_dict = {}
12 |     jsonfiles = glob.glob(os.path.join(args.chatgpt_judge_files_folder, sub_folder_name_list[i],'*.json' ))
13 |     for jsonfile in jsonfiles:
14 |       with open(jsonfile, 'r', encoding='utf-8') as f:
15 |         data = json.load(f)
16 |         single_dataset_dict.update(data)
17 |     
18 |     datasets_dict[datastet_name] = single_dataset_dict
19 |   
20 |   with open(args.merge_file, 'w', encoding='utf-8') as f:
21 |     json.dump(datasets_dict, f, indent=2)    
22 |     
23 | 
24 | if __name__ == "__main__":
25 |   parser = argparse.ArgumentParser()
26 |   parser.add_argument("--chatgpt_judge_files_folder", type=str, default="/remote-home/share/VideoBenchmark/Video_Benchmark/VLLM-3metrics/Video-LLaVA/ChatGPT_Judge")
27 |   parser.add_argument("--merge_file", type=str, default="./Video-Bench-Input.json")
28 |   args = parser.parse_args()
29 |   dataset_score_dict = merge_json(args)
30 | 


--------------------------------------------------------------------------------
/assets/1:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/assets/leaderboard2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKU-YuanGroup/Video-Bench/8b2101d1f80370121c754caefa3f1cf8b6b979ca/assets/leaderboard2.png


--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKU-YuanGroup/Video-Bench/8b2101d1f80370121c754caefa3f1cf8b6b979ca/assets/logo.png


--------------------------------------------------------------------------------
/assets/logo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKU-YuanGroup/Video-Bench/8b2101d1f80370121c754caefa3f1cf8b6b979ca/assets/logo2.png


--------------------------------------------------------------------------------
/assets/pie_fig.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKU-YuanGroup/Video-Bench/8b2101d1f80370121c754caefa3f1cf8b6b979ca/assets/pie_fig.jpg


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | openai
3 | 


--------------------------------------------------------------------------------