├── logo.png
├── longvideobench
│   ├── __init__.py
│   └── longvideobench_dataset.py
├── lvb_teaser.png
├── leaderboard_paper.png
├── pyproject.toml
└── README.md
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longvideobench/LongVideoBench/HEAD/logo.png
--------------------------------------------------------------------------------
/longvideobench/__init__.py:
--------------------------------------------------------------------------------
1 | from .longvideobench_dataset import LongVideoBenchDataset
2 |
3 |
--------------------------------------------------------------------------------
/lvb_teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longvideobench/LongVideoBench/HEAD/lvb_teaser.png
--------------------------------------------------------------------------------
/leaderboard_paper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longvideobench/LongVideoBench/HEAD/leaderboard_paper.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "longvideobench"
7 | version = "0.0.1"
8 | description = "LongVideoBench: A Benchmark for Long-context Interleaved Video-Language Understanding"
9 | readme = "README.md"
10 | requires-python = ">=3.8"
11 | classifiers = [
12 | "Programming Language :: Python :: 3",
13 | "License :: OSI Approved :: Apache Software License",
14 | ]
15 | dependencies = [
16 | "torch>=2.1.1", "decord", "torchvision", "pillow",
17 | ]
18 |
19 |
20 | [project.urls]
21 | "Bug Tracker" = "https://github.com/longvideobench/LongVideoBench/issues"
22 |
23 | [tool.setuptools.packages.find]
24 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
25 |
26 | [tool.wheel]
27 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LongVideoBench: A Benchmark for Long-context Interleaved Video-Language Understanding

## Introduction

![Teaser](lvb_teaser.png)

(left) A referring reasoning question. (right) Results with different input frames.

## Initial Leaderboard

![Initial leaderboard](leaderboard_paper.png)

View more on the HuggingFace Leaderboard.

42 | ## [Custom Use] Load the LongVideoBench Dataset
43 |
44 | 1. Download the dataset via Hugging Face CLI:
45 |
46 | ```shell
47 | huggingface-cli download longvideobench/LongVideoBench --repo-type dataset --local-dir LongVideoBench --local-dir-use-symlinks False
48 | ```
49 |
50 | 2. Extract from the `.tar` files:
51 |
52 | ```shell
53 | cat videos.tar.part.* > videos.tar
54 | tar -xvf videos.tar
55 | tar -xvf subtitles.tar
56 | ```
57 |
58 | 3. Use the LongVideoBench dataloader to load the data from the raw MP4 files and subtitles:
59 |
60 | - (a) Install the dataloader:
61 |
62 | ```shell
63 | git clone https://github.com/LongVideoBench/LongVideoBench.git
64 | cd LongVideoBench
65 | pip install -e .
66 | ```
67 | - (b) Load the dataset in python scripts:
68 |
69 | ```python
70 | from longvideobench import LongVideoBenchDataset
71 |
72 | # validation
73 | dataset = LongVideoBenchDataset(YOUR_DATA_PATH, "lvb_val.json", max_num_frames=64)
74 |
75 | # test
76 | dataset = LongVideoBenchDataset(YOUR_DATA_PATH, "lvb_test_wo_gt.json", max_num_frames=64)
77 |
78 | print(dataset[0]["inputs"]) # A list consisting of PIL.Image and strings.
79 | ```
80 |
81 | The "inputs" are interleaved video frames and text subtitles, followed by questions and option prompts. You can then convert them to the format that your LMMs can accept.
82 |
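For instance, here is a minimal sketch of such a conversion, assuming your LMM consumes a single text prompt with `<image>` placeholders plus an ordered list of PIL images (the helper name and the placeholder token are illustrative, not part of this repository):

```python
from PIL import Image

def to_prompt_and_images(inputs, image_token="<image>"):
    # Split the interleaved list into one text prompt (with a placeholder per frame)
    # and the ordered list of PIL images.
    text_parts, images = [], []
    for item in inputs:
        if isinstance(item, Image.Image):
            text_parts.append(image_token)
            images.append(item)
        else:  # subtitle line, question, option, or instruction string
            text_parts.append(item)
    return "\n".join(text_parts), images

prompt, images = to_prompt_and_images(dataset[0]["inputs"])
```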
83 | ## [Automatic] Evaluating with LMMs-Eval
84 |
85 | LongVideoBench has been integrated into the [LMMs-Eval](https://github.com/EvolvingLMMs-Lab/lmms-eval) library for automatic evaluation. With the datasets and models hosted on Hugging Face, you can start automatic evaluation as soon as the LMMs-Eval library is properly installed.
86 |
87 | ### Install
88 |
89 | Please install LMMs-Eval as follows:
90 |
91 | ```
92 | git clone https://github.com/EvolvingLMMs-Lab/lmms-eval
93 | cd lmms-eval
94 | pip install -e .
95 | ```
96 |
97 | This will install the GitHub main version, which supports the tasks `longvideobench_val_i` (LongVideoBench for Image LMMs) and `longvideobench_val_v` (LongVideoBench for Video-specific LMMs).
98 |
99 | ### Example Use (Image LMMs)
100 |
101 | We feed 16 frames by default for Image LMMs. To modify this, go to `lmms_eval/tasks/longvideobench/utils.py` and change the parameter `max_num_frames` to another value (e.g. 4, 8, or 32; or even 64, 128, or 256 for proprietary models).
102 |
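For example, the edit is a one-line change along these lines (a sketch only; the exact position of the parameter inside `utils.py` may differ):

```python
# in lmms_eval/tasks/longvideobench/utils.py (sketch; surrounding code omitted)
max_num_frames = 32  # default is 16; larger values require longer-context models
```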
103 | - Idefics2
104 |
105 | ```
106 | python3 -m accelerate.commands.launch --num_processes=8 -m lmms_eval --model idefics2 --tasks longvideobench_val_i --batch_size 1 --log_samples --log_samples_suffix idefics2_lvb_i --output_path ./logs/
107 | ```
108 |
109 | - Phi3V
110 |
111 | ```
112 | python3 -m accelerate.commands.launch --num_processes=8 -m lmms_eval --model phi3v --tasks longvideobench_val_i --batch_size 1 --log_samples --log_samples_suffix phi3v_lvb_i --output_path ./logs/
113 | ```
114 |
115 | ### Example Use (Video-specific LMMs)
116 |
117 | - LLaVA-NeXT-Video-34B-DPO
118 |
119 | (32 frames)
120 |
121 | ```
122 | python3 -m accelerate.commands.launch --num_processes=8 -m lmms_eval --model llavavid --model_args pretrained="lmms-lab/LLaVA-NeXT-Video-34B-DPO",max_frames_num=32,conv_template=chatml_direct,video_decode_backend="decord" --tasks longvideobench_val_v --batch_size 1 --log_samples --log_samples_suffix llavavid_34b_dpo_lvb_v --output_path ./logs/
123 | ```
124 |
125 | - LLaVA-NeXT-Video-7B-DPO
126 |
127 | (32 frames)
128 |
129 | ```
130 | python3 -m accelerate.commands.launch --num_processes=8 -m lmms_eval --model llavavid --model_args pretrained="lmms-lab/LLaVA-NeXT-Video-7B-DPO",max_frames_num=32,video_decode_backend="decord" --tasks longvideobench_val_v --batch_size 1 --log_samples --log_samples_suffix llavavid_7b_dpo_lvb_v --output_path ./logs/
131 | ```
132 |
133 | - Video-LLaVA
134 |
135 | (8 frames)
136 |
137 | ```
138 | python3 -m accelerate.commands.launch --num_processes=8 -m lmms_eval --model video_llava --tasks longvideobench_val_v --batch_size 1 --log_samples --log_samples_suffix video_llava_lvb_v --output_path ./logs/
139 | ```
140 |
141 |
142 |
143 | ## Contact
144 |
145 | Please contact `haoning001@e.ntu.edu.sg` for any queries.
146 |
147 | ## License
148 |
149 | This dataset is released under the CC-BY-NC-SA 4.0 license. Please use it for non-commercial purposes ONLY.
150 |
151 | ## Citation
152 |
153 | ```bibtex
154 | @misc{wu2024longvideobench,
155 | title={LongVideoBench: A Benchmark for Long-context Interleaved Video-Language Understanding},
156 | author={Haoning Wu and Dongxu Li and Bei Chen and Junnan Li},
157 | year={2024},
158 | eprint={2407.15754},
159 | archivePrefix={arXiv},
160 | primaryClass={cs.CV},
161 | url={https://arxiv.org/abs/2407.15754},
162 | }
163 | ```
--------------------------------------------------------------------------------
/longvideobench/longvideobench_dataset.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset
2 | import os
3 | import decord
4 | from decord import VideoReader, cpu
5 | import numpy as np
6 | from PIL import Image
7 | import torch
8 |
9 | import json
10 |
11 | def timestamp_to_seconds(timestamp):
12 | # Split the timestamp into hours, minutes, and seconds
13 | h, m, s = timestamp.split(':')
14 | # Convert hours, minutes, and total seconds (including fractions) to float and compute total seconds
15 | total_seconds = int(h) * 3600 + int(m) * 60 + float(s)
16 | return total_seconds
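# Quick sanity check of the conversion above (for reference):
#   timestamp_to_seconds("01:02:03.500") -> 1*3600 + 2*60 + 3.5 = 3723.5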
17 |
18 | def load_video(video_file, duration, max_num_frames=16):
19 |     # Sample up to `max_num_frames` frames uniformly from the first `duration` seconds of the video.
20 | vr = VideoReader(video_file, ctx=cpu(0), num_threads=1)
21 | fps = vr.get_avg_fps()
22 | total_valid_frames = int(duration * fps)
23 | num_frames = min(max_num_frames, int(duration))
24 |
25 | frame_indices = [int(total_valid_frames / num_frames) * i for i in range(num_frames)]
26 |
27 | frames = vr.get_batch(frame_indices)
28 | if isinstance(frames, torch.Tensor):
29 | frames = frames.numpy()
30 | else:
31 | frames = frames.asnumpy()
32 | frame_timestamps = [frame_index / fps for frame_index in frame_indices]
33 |
34 | return [Image.fromarray(fr).convert("RGB") for fr in frames], frame_timestamps
35 |
36 | def insert_subtitles(subtitles):
37 | interleaved_list = []
38 | cur_i = 0
39 |
40 | for subtitle in subtitles:
41 | if "timestamp" in subtitle:
42 | subtitle_text = subtitle["text"]
43 | else:
44 | subtitle_text = subtitle["line"]
45 |
46 | interleaved_list.append(subtitle_text)
47 |
48 | return interleaved_list
49 |
50 | def insert_subtitles_into_frames(frames, frame_timestamps, subtitles,
51 | starting_timestamp_for_subtitles, duration):
52 | interleaved_list = []  # interleaved output: frames and subtitle strings in temporal order
53 | cur_i = 0  # index of the first frame not yet appended to the output
54 |
55 | for subtitle in subtitles:
56 | if "timestamp" in subtitle:
57 | start, end = subtitle["timestamp"]
58 |
59 | if not isinstance(end, float):
60 | end = duration
61 |
62 | start -= starting_timestamp_for_subtitles
63 | end -= starting_timestamp_for_subtitles
64 |
65 |
66 | subtitle_timestamp = (start + end) / 2
67 | subtitle_text = subtitle["text"]
68 | else:
69 | start, end = subtitle["start"], subtitle["end"]
70 | start = timestamp_to_seconds(start)
71 | end = timestamp_to_seconds(end)
72 | start -= starting_timestamp_for_subtitles
73 | end -= starting_timestamp_for_subtitles
74 |
75 | subtitle_timestamp = (start + end) / 2
76 | subtitle_text = subtitle["line"]
77 |
78 |
79 | for i, (frame, frame_timestamp) in enumerate(zip(frames[cur_i:], frame_timestamps[cur_i:])):
80 | if frame_timestamp <= subtitle_timestamp:
81 | #print("frame:", frame_timestamp)
82 | interleaved_list.append(frame)
83 | cur_i += 1
84 | else:
85 | break
86 |
87 | if end - start < 1:
88 | end = subtitle_timestamp + 0.5
89 | start = subtitle_timestamp - 0.5
90 |
91 | covering_frames = False
92 | for frame, frame_timestamp in zip(frames, frame_timestamps):
93 | if frame_timestamp < end and frame_timestamp > start:
94 | covering_frames = True
95 | break
96 | #
97 | if covering_frames:
98 | #print("subtitle:", subtitle_timestamp, start, end)
99 | interleaved_list.append(subtitle_text)
100 | else:
101 | pass
102 | #print("leaving out subtitle:", start, end)
103 |
104 | for i, (frame, frame_timestamp) in enumerate(zip(frames[cur_i:], frame_timestamps[cur_i:])):
105 | #print(frame_timestamp)
106 | interleaved_list.append(frame)
107 |
108 | return interleaved_list
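# Illustrative trace of insert_subtitles_into_frames() above, on hypothetical inputs:
#   frames at t=1.0s and t=3.0s, subtitles=[{"timestamp": [0.5, 1.5], "text": "hi"}],
#   starting_timestamp_for_subtitles=0, duration=4.0
#   -> [frame@1.0s, "hi", frame@3.0s]  (each subtitle is placed at its midpoint timestamp)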
109 |
110 | class LongVideoBenchDataset(Dataset):
111 | def __init__(self,
112 | data_path,
113 | annotation_file,
114 | max_num_frames=256,
115 | insert_text=True,
116 | insert_frame=True,
117 | ):
118 | super().__init__()
119 | self.data_path = data_path
120 | self.insert_text = insert_text
121 |
122 | with open(os.path.join(data_path, annotation_file)) as f:
123 | self.data = json.load(f)
124 | self.max_num_frames = max_num_frames
125 |
126 |
127 |
128 | def __getitem__(self, index):
129 | di = self.data[index]
130 |
131 | if self.max_num_frames == 0:
132 | ### No subtitles, no frames
133 | inputs = ["Question: " + di["question"]]
134 | inputs += [". ".join([chr(ord("A")+i), candidate]) for i, candidate in enumerate(di["candidates"])]
135 | inputs += ["Answer with the option's letter from the given choices directly."]
136 | return {"inputs": inputs, "correct_choice": chr(ord("A")+di["correct_choice"]), "id": di["id"]}
137 | if self.max_num_frames == -1:
138 | ### All subtitles, no frames
139 | with open(os.path.join(self.data_path, "subtitles", di["subtitle_path"])) as f:
140 | subtitles = json.load(f)
141 | inputs = insert_subtitles(subtitles)
142 | inputs += ["Question: " + di["question"]]
143 | inputs += [". ".join([chr(ord("A")+i), candidate]) for i, candidate in enumerate(di["candidates"])]
144 | inputs += ["Answer with the option's letter from the given choices directly."]
145 | return {"inputs": inputs, "correct_choice": chr(ord("A")+di["correct_choice"]), "id": di["id"]}
146 |
147 | frames, frame_timestamps = load_video(os.path.join(self.data_path, "videos", di["video_path"]), di["duration"], max_num_frames=self.max_num_frames)
148 |
149 |
150 | with open(os.path.join(self.data_path, "subtitles", di["subtitle_path"])) as f:
151 | subtitles = json.load(f)
152 | inputs = []
153 | if self.insert_text:
154 | inputs = insert_subtitles_into_frames(frames, frame_timestamps, subtitles, di["starting_timestamp_for_subtitles"], di["duration"])
155 | else:
156 | inputs = frames
157 |
158 | ##### YOU MAY MODIFY THE FOLLOWING PART TO ADAPT TO YOUR MODEL #####
159 | inputs += ["Question: " + di["question"]]
160 | inputs += [". ".join([chr(ord("A")+i), candidate]) for i, candidate in enumerate(di["candidates"])]
161 | inputs += ["Answer with the option's letter from the given choices directly."]
162 | ##### YOU MAY MODIFY THE PREVIOUS PART TO ADAPT TO YOUR MODEL #####
163 |
164 | ##### CORRECT CHOICE WILL BE "@" FOR TEST SET SAMPLES #####
165 | return {"inputs": inputs, "correct_choice": chr(ord("A")+di.get("correct_choice", -1)), "id": di["id"]}
166 |
167 | def __len__(self):
168 | return len(self.data)
169 |
170 | def get_id(self, index):
171 | return self.data[index]["id"]
172 |
173 | if __name__ == "__main__":
174 | db = LongVideoBenchDataset("../", "lvb_val.json")
175 | for i in range(10):
176 | print([ele for ele in db[i]["inputs"] if not isinstance(ele, str)])
177 |
178 |
179 |
180 |
--------------------------------------------------------------------------------