├── .gitignore
├── README.md
├── assets
├── cli.gif
├── niavh.png
├── profile.png
└── videollamb.png
├── llava
├── __init__.py
├── constants.py
├── conversation.py
├── eval
│ ├── eval_videoqa.py
│ ├── eval_videoqa_consistency.py
│ ├── eval_videoqa_context.py
│ ├── eval_videoqa_correctness.py
│ ├── eval_videoqa_detailed_orientation.py
│ ├── eval_videoqa_moviechat.py
│ ├── eval_videoqa_temporal.py
│ ├── model_videoqa.py
│ ├── model_videoqa_act.py
│ ├── model_videoqa_consistency.py
│ ├── model_videoqa_general.py
│ ├── model_videoqa_mc.py
│ ├── model_videoqa_moviechat.py
│ ├── model_videoqa_moviechat_ws.py
│ └── model_videoqa_mvbench.py
├── mm_utils.py
├── model
│ ├── __init__.py
│ ├── apply_delta.py
│ ├── builder.py
│ ├── consolidate.py
│ ├── language_model
│ │ ├── llava_llama.py
│ │ ├── llava_llama_retro.py
│ │ ├── llava_llama_retro_lm.py
│ │ ├── llava_mistral.py
│ │ └── llava_mpt.py
│ ├── llava_arch.py
│ ├── make_delta.py
│ ├── multimodal_encoder
│ │ ├── builder.py
│ │ ├── clip_encoder.py
│ │ ├── clip_vid_encoder.py
│ │ ├── deform_clip
│ │ │ ├── __init__.py
│ │ │ ├── deform_clip_configuration.py
│ │ │ └── deform_clip_modeling.py
│ │ ├── deformer
│ │ │ ├── __init__.py
│ │ │ └── deformer_modeling.py
│ │ ├── egovlp
│ │ │ ├── EgoNCE_MLM_ITM_Config.yml
│ │ │ ├── __init__.py
│ │ │ ├── model
│ │ │ │ ├── base.py
│ │ │ │ ├── heads.py
│ │ │ │ ├── metric.py
│ │ │ │ ├── model.py
│ │ │ │ ├── roberta.py
│ │ │ │ ├── util.py
│ │ │ │ └── video_transformer.py
│ │ │ ├── mq.json
│ │ │ ├── parse_config.py
│ │ │ └── processor.py
│ │ ├── languagebind
│ │ │ ├── __init__.py
│ │ │ ├── audio
│ │ │ │ ├── configuration_audio.py
│ │ │ │ ├── modeling_audio.py
│ │ │ │ ├── processing_audio.py
│ │ │ │ └── tokenization_audio.py
│ │ │ ├── depth
│ │ │ │ ├── configuration_depth.py
│ │ │ │ ├── modeling_depth.py
│ │ │ │ ├── processing_depth.py
│ │ │ │ └── tokenization_depth.py
│ │ │ ├── image
│ │ │ │ ├── configuration_image.py
│ │ │ │ ├── modeling_image.py
│ │ │ │ ├── processing_image.py
│ │ │ │ └── tokenization_image.py
│ │ │ ├── rmt_video
│ │ │ │ ├── configuration_video.py
│ │ │ │ ├── modeling_video.py
│ │ │ │ ├── processing_video.py
│ │ │ │ └── tokenization_video.py
│ │ │ ├── thermal
│ │ │ │ ├── configuration_thermal.py
│ │ │ │ ├── modeling_thermal.py
│ │ │ │ ├── processing_thermal.py
│ │ │ │ └── tokenization_thermal.py
│ │ │ └── video
│ │ │ │ ├── configuration_video.py
│ │ │ │ ├── modeling_video.py
│ │ │ │ ├── processing_video.py
│ │ │ │ └── tokenization_video.py
│ │ ├── mae_encoder.py
│ │ ├── rmt_clip
│ │ │ ├── __init__.py
│ │ │ └── rmt_clip_modeling.py
│ │ ├── rmt_vivit
│ │ │ ├── __init__.py
│ │ │ └── modeling_rmt_vivit.py
│ │ ├── videomae_encoder.py
│ │ ├── vit_encoder.py
│ │ └── vivit_encoder.py
│ ├── multimodal_projector
│ │ ├── builder.py
│ │ ├── identity_projector.py
│ │ ├── mlp_projector.py
│ │ ├── mlp_transformer_projector.py
│ │ ├── qformer_projector.py
│ │ ├── retent_transformer_projector.py
│ │ ├── rmt_r_transformer_cap_projector.py
│ │ ├── rmt_r_transformer_projector.py
│ │ ├── rmt_transformer_projector.py
│ │ ├── self_retriever.py
│ │ ├── self_segment.py
│ │ ├── spatial_pool_projector.py
│ │ └── transformer_projector.py
│ └── utils.py
├── serve
│ ├── __init__.py
│ ├── arguments_live.py
│ ├── cli.py
│ ├── cli_streaming.py
│ ├── examples
│ │ ├── desert.jpg
│ │ ├── extreme_ironing.jpg
│ │ ├── sample_demo_1.mp4
│ │ ├── sample_demo_13.mp4
│ │ ├── sample_demo_22.mp4
│ │ ├── sample_demo_3.mp4
│ │ ├── sample_demo_8.mp4
│ │ ├── sample_demo_9.mp4
│ │ ├── sample_img_13.png
│ │ ├── sample_img_22.png
│ │ ├── sample_img_8.png
│ │ ├── videos
│ │ │ ├── cache
│ │ │ │ └── llava
│ │ │ │ │ └── serve
│ │ │ │ │ └── examples
│ │ │ │ │ └── videos
│ │ │ │ │ └── dance_1fps.mp4
│ │ │ ├── dance.json
│ │ │ └── dance.mp4
│ │ └── waterview.jpg
│ ├── gradio_demo.py
│ ├── gradio_utils.py
│ └── inference.py
├── train
│ ├── llama_flash_attn_monkey_patch.py
│ ├── llama_xformers_attn_monkey_patch.py
│ ├── llava_trainer.py
│ ├── train.py
│ ├── train_mem.py
│ └── train_xformers.py
├── utils.py
└── vid_utils.py
├── playground
├── DATA.md
└── eval
│ └── GPT_Zero_Shot_QA
│ ├── EgoPlan_Zero_Shot_QA
│ ├── test_a.json
│ └── test_q.json
│ ├── EgoSchema_Zero_Shot_QA
│ ├── test_a.json
│ └── test_q.json
│ ├── MVBench_Zero_Shot_QA
│ ├── test_a.json
│ └── test_q.json
│ └── NExT_Zero_Shot_QA
│ ├── test_a.json
│ └── test_q.json
├── pyproject.toml
└── scripts
├── eval
├── egoplan.sh
├── egoschema.sh
├── mvbench.sh
└── nextqa.sh
├── eval_acc.py
├── eval_moviechat.py
├── eval_nextqa.py
├── extract_mm_projector.py
├── finetune_video_image.slurm
├── finetune_video_image_loss.slurm
├── gather_moviechat.py
├── merge_lora_weights.py
├── zero1.json
├── zero2.json
├── zero2_offload.json
├── zero3.json
└── zero3_offload.json
/.gitignore:
--------------------------------------------------------------------------------
1 | # this proj
2 | playground/data
3 | checkpoints/
4 |
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 | cover/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | .pybuilder/
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | # For a library or package, you might want to ignore these files since the code is
91 | # intended to run in multiple environments; otherwise, check them in:
92 | # .python-version
93 |
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 |
101 | # poetry
102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | # This is especially recommended for binary packages to ensure reproducibility, and is more
104 | # commonly ignored for libraries.
105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 |
108 | # pdm
109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | # in version control.
113 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
114 | .pdm.toml
115 | .pdm-python
116 | .pdm-build/
117 |
118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
119 | __pypackages__/
120 |
121 | # Celery stuff
122 | celerybeat-schedule
123 | celerybeat.pid
124 |
125 | # SageMath parsed files
126 | *.sage.py
127 |
128 | # Environments
129 | .env
130 | .venv
131 | env/
132 | venv/
133 | ENV/
134 | env.bak/
135 | venv.bak/
136 |
137 | # Spyder project settings
138 | .spyderproject
139 | .spyproject
140 |
141 | # Rope project settings
142 | .ropeproject
143 |
144 | # mkdocs documentation
145 | /site
146 |
147 | # mypy
148 | .mypy_cache/
149 | .dmypy.json
150 | dmypy.json
151 |
152 | # Pyre type checker
153 | .pyre/
154 |
155 | # pytype static type analyzer
156 | .pytype/
157 |
158 | # Cython debug symbols
159 | cython_debug/
160 |
161 | # PyCharm
162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164 | # and can be added to the global gitignore or merged into this file. For a more nuclear
165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166 | #.idea/
167 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | # VideoLLaMB: Long Video Understanding with Recurrent Memory Bridges
8 |
9 | [Project Page](https://videollamb.github.io/)
10 | [arXiv:2409.01071](https://arxiv.org/abs/2409.01071)
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | Recent advancements in large-scale video-language models have demonstrated remarkable capabilities in real-time planning and interaction with real-world environments, yet their training is constrained by high computational costs and limited annotated datasets. Traditional methods, such as video compression and sliding-window techniques, often discard critical visual information or disrupt the semantic flow. In addition, current predesigned QA benchmarks fail to adequately assess long-video understanding due to inherent biases from static image features and the base LLM. To address these issues, we introduce VideoLLaMB, a framework that uses Memory Bridge Layers with recurrent memory tokens to encode entire videos without discarding vital information. We also propose the SceneTilling algorithm, which splits a video into semantic units to preserve the semantic flow. Finally, we present the "Needle in a Video Haystack" benchmark to comprehensively evaluate long-video understanding over needles of different modalities.
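
As a rough illustration only (not the paper's exact algorithm), SceneTilling-style segmentation can be pictured as cutting the frame sequence wherever the semantic similarity between adjacent frame features drops. In the sketch below the frame encoder, feature dimension, and threshold are placeholders:

```python
# Illustrative sketch of SceneTilling-style segmentation.
# Placeholder features and threshold; not the exact algorithm used in VideoLLaMB.
import torch
import torch.nn.functional as F

def split_into_semantic_segments(frame_feats: torch.Tensor, threshold: float = 0.85):
    """Split a (T, D) sequence of frame features at low-similarity boundaries."""
    sims = F.cosine_similarity(frame_feats[:-1], frame_feats[1:], dim=-1)  # (T-1,) adjacent similarities
    cuts = (sims < threshold).nonzero(as_tuple=True)[0] + 1                # cut after each similarity drop
    segments, start = [], 0
    for c in cuts.tolist():
        segments.append(frame_feats[start:c])
        start = c
    segments.append(frame_feats[start:])
    return segments

frame_feats = torch.randn(32, 512)  # stand-in for per-frame features from a vision encoder
print([len(s) for s in split_into_semantic_segments(frame_feats)])
```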
19 |
20 |
21 |
22 |
23 | Table of Contents
24 | - [Install](#install)
25 | - [Quick Start with CLI](#quickstart-with-cli)
26 | - [Streaming Caption with CLI](#streaming-video-caption-with-cli)
27 | - [Demo](#gradio-demo)
28 | - [Train](#train)
29 | - [Evaluate](#evaluate)
30 | - [Model Zoo](#model-zoo)
31 | - [Citation](#citation)
32 | - [Acknowledgement](#acknowledgement)
33 |
34 | ## Install
35 |
36 | 1. Clone this repository and navigate to the VideoLLaMB folder
37 | ```bash
38 | git clone https://github.com/bigai-nlco/VideoLLaMB.git
39 | cd VideoLLaMB
40 | ```
41 |
42 | 2. Install Package
43 | ```bash
44 | conda create -n videollamb python=3.10 -y
45 | conda activate videollamb
46 | pip install --upgrade pip
47 | pip install -e .
48 | conda install ffmpeg
49 | ```
50 |
51 | 3. Install additional packages for training
52 | ```bash
53 | pip install -e ".[train]"
54 | pip install flash-attn --no-build-isolation
55 | # if the above fails because of a stale cached build, retry: pip install flash-attn --no-build-isolation --no-cache-dir
56 | ```
57 |
58 |
59 | ## QuickStart With CLI
60 |
61 | Download the checkpoint, place it in the `checkpoints` directory, then run the following command:
62 | ```bash
63 | python -m llava.serve.cli --model-path checkpoints/videollamb-llava-1.5-7b --video-file XXX.mp4
64 | ```
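
For programmatic use, the checkpoint can presumably also be loaded through the repository's builder. The sketch below assumes the fork keeps upstream LLaVA's `load_pretrained_model` interface (returning a tokenizer, the model, a visual processor, and the context length); the exact signature here may differ.

```python
# Sketch of programmatic loading, assuming the builder follows upstream LLaVA's interface.
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path

model_path = "checkpoints/videollamb-llava-1.5-7b"
tokenizer, model, processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_base=None,  # full checkpoint, no LoRA base
    model_name=get_model_name_from_path(model_path),
)
print(type(model).__name__, context_len)
```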
65 |
66 |
67 |
68 |
69 | ## Streaming Video Caption with CLI
70 |
71 | Download the checkpoint, place it in the `checkpoints` directory, then run the following command:
72 | ```bash
73 | python -m llava.serve.cli_streaming --model_path checkpoints/videollamb-llava-1.5-7b
74 | ```
75 |
76 |
77 |
78 | https://github.com/user-attachments/assets/96c32452-f910-4c6c-9feb-0e98134d45a1
79 |
80 |
81 |
82 | ## Gradio Demo
83 |
84 | Download the checkpoint, place it in the `checkpoints` directory, then run the following command:
85 | ```bash
86 | python -m llava.serve.gradio_demo
87 | ```
88 |
89 |
90 | https://github.com/user-attachments/assets/2ea521e5-4bf2-415c-b20d-f5663c93af57
91 |
92 |
93 |
94 |
95 |
96 |
97 | ## Train
98 |
99 |
100 | 1. Prepare data
101 |
102 | We combine the video instruction data from [PLLaVA](https://github.com/magic-research/PLLaVA) and the image instruction data from [LLaVA](https://github.com/haotian-liu/LLaVA) for training. Please check [DATA](playground/DATA.md) for details.
103 |
104 | 2. Prepare model weights for initialization
105 |
106 | Our model is initialized from LLaVA: download [llava-v1.5-7b](https://huggingface.co/liuhaotian/llava-v1.5-7b) and put it in `checkpoints/llava-v1.5-7b`. For the visual encoders, we use LanguageBind: download [LanguageBind_Image](https://huggingface.co/LanguageBind/LanguageBind_Image) and [LanguageBind_Video_merge](https://huggingface.co/LanguageBind/LanguageBind_Video_merge), and put them in `checkpoints/LanguageBind_Image` and `checkpoints/LanguageBind_Video_merge`.
107 |
108 |
109 | 3. Start Training
110 |
111 | Training takes 23 hours for LLaVA-1.5-7B on 4 A800 (80GB) GPUs.
112 |
113 | ```bash
114 | bash scripts/finetune_video_image.slurm # bash
115 | sbatch scripts/finetune_video_image.slurm # slurm cluster
116 | ```
117 |
118 | We also provide a script that backpropagates the LLM loss to the bridge at each recurrent iteration (see the sketch after the commands below).
119 |
120 | ```bash
121 | bash scripts/finetune_video_image_loss.slurm # bash
122 | sbatch scripts/finetune_video_image_loss.slurm # slurm cluster
123 | ```
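
As a rough picture of what per-iteration loss backpropagation means here, the toy sketch below applies the language-model loss segment by segment while the carried memory is detached between iterations. All modules are stand-ins, not the repo's trainer or model classes.

```python
# Toy illustration of per-iteration loss backprop with a recurrent memory (not the repo's trainer).
import torch
import torch.nn as nn

bridge = nn.Linear(512, 512)   # stand-in for the memory bridge layer
head = nn.Linear(512, 1000)    # stand-in for the LLM that consumes bridged tokens
opt = torch.optim.AdamW(list(bridge.parameters()) + list(head.parameters()), lr=1e-4)

segments = [torch.randn(8, 512) for _ in range(4)]         # 4 semantic segments of 8 frame tokens
targets = [torch.randint(0, 1000, (8,)) for _ in range(4)] # dummy next-token targets
memory = torch.zeros(1, 512)                               # recurrent memory token

opt.zero_grad()
for seg, tgt in zip(segments, targets):
    fused = bridge(torch.cat([memory, seg], dim=0))  # bridge mixes memory with segment tokens
    logits = head(fused[1:])                         # "LLM" consumes the bridged segment tokens
    loss = nn.functional.cross_entropy(logits, tgt)
    loss.backward()                                  # backprop this iteration's loss immediately
    memory = fused[:1].detach()                      # carry memory forward, truncating the graph
opt.step()
```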
124 |
125 |
126 |
127 | ## Evaluate
128 |
129 | 1. Prepare data
130 |
131 | We provide evaluation pipelines for [EgoSchema](https://egoschema.github.io/), [NExTQA](https://github.com/doc-doc/NExT-QA), [EgoPlan](https://github.com/ChenYi99/EgoPlan?tab=readme-ov-file#egoplan-evaluation-data), and [MVBench](https://huggingface.co/datasets/OpenGVLab/MVBench). Please check [DATA](playground/DATA.md) for details.
132 |
133 | 2. Start Evaluating
134 |
135 | a. Traditional Benchmark
136 |
137 | ```bash
138 | bash scripts/eval/egoschema.sh # egoschema
139 | bash scripts/eval/nextqa.sh # nextqa
140 | bash scripts/eval/egoplan.sh # egoplan
141 | bash scripts/eval/mvbench.sh # mvbench
142 | ```
143 |
144 | b. MM-NIAVH
145 |
146 | Check our benchmark [Needle In A Video Haystack (NIAVH)](https://github.com/bigai-nlco/MM-NIAVH).
147 |
148 |
149 |
150 |
151 |
152 |
153 | ## Model Zoo
154 |
155 | |Model |Base Model | Training Data | Download Link |
156 | | ------------- | ------------- | -------- | ------------- |
157 | | VideoLLaMB-7B| llava-v1.5-7b | [magic_json](https://huggingface.co/datasets/cathyxl/magic_jsons), [LLaVA](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) | [🤗videollamb-llava-1.5-7b](https://huggingface.co/ColorfulAI/VideoLLaMB) |
158 | | VideoLLaMB-7B-Mem (MM-NIAVH)| llava-v1.5-7b | [magic_json](https://huggingface.co/datasets/cathyxl/magic_jsons), [LLaVA](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) | [🤗videollamb-mem-llava-1.5-7b](https://huggingface.co/ColorfulAI/VideoLLaMB_Mem) |
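
If helpful, a released checkpoint can be fetched with `huggingface_hub` into the `checkpoints/` folder that the commands above expect; the local folder name below simply mirrors the CLI example and can be adjusted.

```python
# Download a released checkpoint into the local path used by the CLI/demo commands above.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="ColorfulAI/VideoLLaMB",
    local_dir="checkpoints/videollamb-llava-1.5-7b",
)
```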
159 |
160 |
161 |
162 | ## Acknowledgement
163 |
164 | Model:
165 | - [LLaVA](https://github.com/haotian-liu/LLaVA)
166 | - [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA)
167 | - [recurrent-memory-transformer](https://github.com/booydar/recurrent-memory-transformer)
168 |
169 | Data:
170 | - [PLLaVA](https://github.com/magic-research/PLLaVA)
171 |
172 | Demo:
173 | - [videollm-online](https://github.com/showlab/videollm-online)
174 |
175 | ## Citation
176 |
177 | ```bibtex
178 | @misc{mm-niavh,
179 | title={MLLM Pressure Test: Needle In A Video Haystack},
180 | author={Wang, Yuxuan and Xie, Cihang and Liu, Yang and Zheng, Zilong},
181 | publisher={GitHub},
182 | url={https://github.com/bigai-nlco/NeedleInAVideoHaystack},
183 | year={2024}
184 | }
185 |
186 | @article{videollamb,
187 | title={VideoLLaMB: Long Video Understanding with Recurrent Memory Bridges},
188 | author={Wang, Yuxuan and Xie, Cihang and Liu, Yang and Zheng, Zilong},
189 | journal={arXiv preprint arXiv:2409.01071},
190 | year={2024}
191 | }
192 | ```
193 |
194 |
--------------------------------------------------------------------------------
/assets/cli.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/cli.gif
--------------------------------------------------------------------------------
/assets/niavh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/niavh.png
--------------------------------------------------------------------------------
/assets/profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/profile.png
--------------------------------------------------------------------------------
/assets/videollamb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/videollamb.png
--------------------------------------------------------------------------------
/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import LlavaLlamaForCausalLM
2 |
--------------------------------------------------------------------------------
/llava/constants.py:
--------------------------------------------------------------------------------
1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30
2 | WORKER_HEART_BEAT_INTERVAL = 15
3 |
4 | LOGDIR = "."
5 |
6 | # Model Constants
7 | IGNORE_INDEX = -100
8 |
9 | # # image
10 | IMAGE_TOKEN_INDEX = -200
11 | DEFAULT_IMAGE_TOKEN = "<image>"
12 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
13 | DEFAULT_IM_START_TOKEN = "<im_start>"
14 | DEFAULT_IM_END_TOKEN = "<im_end>"
15 | IMAGE_PLACEHOLDER = "<image-placeholder>"
16 |
17 | # # video
18 | VIDEO_TOKEN_INDEX = -201
19 | DEFAULT_VIDEO_TOKEN = "