├── .gitignore ├── README.md ├── assets ├── cli.gif ├── niavh.png ├── profile.png └── videollamb.png ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_videoqa.py │ ├── eval_videoqa_consistency.py │ ├── eval_videoqa_context.py │ ├── eval_videoqa_correctness.py │ ├── eval_videoqa_detailed_orientation.py │ ├── eval_videoqa_moviechat.py │ ├── eval_videoqa_temporal.py │ ├── model_videoqa.py │ ├── model_videoqa_act.py │ ├── model_videoqa_consistency.py │ ├── model_videoqa_general.py │ ├── model_videoqa_mc.py │ ├── model_videoqa_moviechat.py │ ├── model_videoqa_moviechat_ws.py │ └── model_videoqa_mvbench.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_llama_retro.py │ │ ├── llava_llama_retro_lm.py │ │ ├── llava_mistral.py │ │ └── llava_mpt.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ ├── clip_vid_encoder.py │ │ ├── deform_clip │ │ │ ├── __init__.py │ │ │ ├── deform_clip_configuration.py │ │ │ └── deform_clip_modeling.py │ │ ├── deformer │ │ │ ├── __init__.py │ │ │ └── deformer_modeling.py │ │ ├── egovlp │ │ │ ├── EgoNCE_MLM_ITM_Config.yml │ │ │ ├── __init__.py │ │ │ ├── model │ │ │ │ ├── base.py │ │ │ │ ├── heads.py │ │ │ │ ├── metric.py │ │ │ │ ├── model.py │ │ │ │ ├── roberta.py │ │ │ │ ├── util.py │ │ │ │ └── video_transformer.py │ │ │ ├── mq.json │ │ │ ├── parse_config.py │ │ │ └── processor.py │ │ ├── languagebind │ │ │ ├── __init__.py │ │ │ ├── audio │ │ │ │ ├── configuration_audio.py │ │ │ │ ├── modeling_audio.py │ │ │ │ ├── processing_audio.py │ │ │ │ └── tokenization_audio.py │ │ │ ├── depth │ │ │ │ ├── configuration_depth.py │ │ │ │ ├── modeling_depth.py │ │ │ │ ├── processing_depth.py │ │ │ │ └── tokenization_depth.py │ │ │ ├── image │ │ │ │ ├── configuration_image.py │ │ │ │ ├── modeling_image.py │ │ │ │ ├── processing_image.py │ │ │ │ └── tokenization_image.py │ │ │ ├── rmt_video │ │ │ │ ├── configuration_video.py │ │ │ │ ├── modeling_video.py │ │ │ │ ├── processing_video.py │ │ │ │ └── tokenization_video.py │ │ │ ├── thermal │ │ │ │ ├── configuration_thermal.py │ │ │ │ ├── modeling_thermal.py │ │ │ │ ├── processing_thermal.py │ │ │ │ └── tokenization_thermal.py │ │ │ └── video │ │ │ │ ├── configuration_video.py │ │ │ │ ├── modeling_video.py │ │ │ │ ├── processing_video.py │ │ │ │ └── tokenization_video.py │ │ ├── mae_encoder.py │ │ ├── rmt_clip │ │ │ ├── __init__.py │ │ │ └── rmt_clip_modeling.py │ │ ├── rmt_vivit │ │ │ ├── __init__.py │ │ │ └── modeling_rmt_vivit.py │ │ ├── videomae_encoder.py │ │ ├── vit_encoder.py │ │ └── vivit_encoder.py │ ├── multimodal_projector │ │ ├── builder.py │ │ ├── identity_projector.py │ │ ├── mlp_projector.py │ │ ├── mlp_transformer_projector.py │ │ ├── qformer_projector.py │ │ ├── retent_transformer_projector.py │ │ ├── rmt_r_transformer_cap_projector.py │ │ ├── rmt_r_transformer_projector.py │ │ ├── rmt_transformer_projector.py │ │ ├── self_retriever.py │ │ ├── self_segment.py │ │ ├── spatial_pool_projector.py │ │ └── transformer_projector.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── arguments_live.py │ ├── cli.py │ ├── cli_streaming.py │ ├── examples │ │ ├── desert.jpg │ │ ├── extreme_ironing.jpg │ │ ├── sample_demo_1.mp4 │ │ ├── sample_demo_13.mp4 │ │ ├── sample_demo_22.mp4 │ │ ├── sample_demo_3.mp4 │ │ ├── sample_demo_8.mp4 │ │ ├── sample_demo_9.mp4 │ │ ├── sample_img_13.png │ │ ├── sample_img_22.png │ │ ├── sample_img_8.png 
│ │ ├── videos │ │ │ ├── cache │ │ │ │ └── llava │ │ │ │ │ └── serve │ │ │ │ │ └── examples │ │ │ │ │ └── videos │ │ │ │ │ └── dance_1fps.mp4 │ │ │ ├── dance.json │ │ │ └── dance.mp4 │ │ └── waterview.jpg │ ├── gradio_demo.py │ ├── gradio_utils.py │ └── inference.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py ├── utils.py └── vid_utils.py ├── playground ├── DATA.md └── eval │ └── GPT_Zero_Shot_QA │ ├── EgoPlan_Zero_Shot_QA │ ├── test_a.json │ └── test_q.json │ ├── EgoSchema_Zero_Shot_QA │ ├── test_a.json │ └── test_q.json │ ├── MVBench_Zero_Shot_QA │ ├── test_a.json │ └── test_q.json │ └── NExT_Zero_Shot_QA │ ├── test_a.json │ └── test_q.json ├── pyproject.toml └── scripts ├── eval ├── egoplan.sh ├── egoschema.sh ├── mvbench.sh └── nextqa.sh ├── eval_acc.py ├── eval_moviechat.py ├── eval_nextqa.py ├── extract_mm_projector.py ├── finetune_video_image.slurm ├── finetune_video_image_loss.slurm ├── gather_moviechat.py ├── merge_lora_weights.py ├── zero1.json ├── zero2.json ├── zero2_offload.json ├── zero3.json └── zero3_offload.json /.gitignore: -------------------------------------------------------------------------------- 1 | # this proj 2 | playground/data 3 | checkpoints/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 114 | .pdm.toml 115 | .pdm-python 116 | .pdm-build/ 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 166 | #.idea/ 167 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |


# VideoLLaMB: Long Video Understanding with Recurrent Memory Bridges

[![videollamb-page](https://img.shields.io/badge/videollamb-page-blue)](https://videollamb.github.io/)
[![arXiv](https://img.shields.io/badge/arXiv-2409.01071-b31b1b.svg)](https://arxiv.org/abs/2409.01071)

Recent advances in large-scale video-language models demonstrate remarkable capabilities in real-time planning and interaction with real-world environments, yet their training is constrained by high computational costs and limited annotated data. Traditional methods, such as video compression and sliding-window techniques, often sacrifice critical visual information or disrupt semantic flow. In addition, current predesigned QA benchmarks fail to adequately assess long video understanding because of inherent biases toward static image features and the base LLM. To address these issues, we introduce VideoLLaMB, a framework that uses Memory Bridge Layers with recurrent memory tokens to encode entire videos without discarding vital information. We also propose the SceneTilling algorithm, which splits a video into semantic units to preserve its semantic flow. Finally, we present the "Needle in a Video Haystack" (NIAVH) benchmark to comprehensively evaluate long video understanding over needles of different modalities.
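To make the memory mechanism above concrete, the simplified PyTorch sketch below shows how a recurrent memory bridge of this kind can work: the video is split into semantic segments, each segment is encoded jointly with a small set of memory tokens, and the updated memory tokens are carried into the next segment. This is an illustrative sketch of the idea only; the layer sizes, names, and attention layout are assumptions rather than the actual code, which lives under `llava/model/multimodal_projector/` (for example `rmt_r_transformer_projector.py`).

```python
import torch
import torch.nn as nn


class MemoryBridgeSketch(nn.Module):
    """Illustrative recurrent memory bridge; NOT the repository's implementation."""

    def __init__(self, dim=1024, num_memory_tokens=32, num_heads=8):
        super().__init__()
        # Learnable memory tokens that persist across video segments.
        self.memory = nn.Parameter(torch.randn(num_memory_tokens, dim) * 0.02)
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=num_heads, batch_first=True)
        self.bridge = nn.TransformerEncoder(layer, num_layers=2)

    def forward(self, segments):
        """segments: list of (tokens_per_segment, dim) tensors, one per semantic unit."""
        mem = self.memory.unsqueeze(0)                     # (1, M, dim)
        num_mem = mem.size(1)
        outputs = []
        for seg in segments:
            x = torch.cat([mem, seg.unsqueeze(0)], dim=1)  # prepend memory to segment tokens
            x = self.bridge(x)                             # joint self-attention over memory + segment
            mem = x[:, :num_mem]                           # recurrently updated memory tokens
            outputs.append(x[:, num_mem:])                 # segment tokens, now memory-conditioned
        # The concatenated tokens are what a projector would hand to the LLM.
        return torch.cat(outputs, dim=1), mem


# Tiny usage example with two fake segments of visual features.
bridge = MemoryBridgeSketch(dim=1024)
segments = [torch.randn(196, 1024), torch.randn(196, 1024)]
tokens, memory = bridge(segments)
```

In VideoLLaMB, SceneTilling decides where the segment boundaries fall, so the memory is updated once per semantic unit rather than per fixed-length window.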
Table of Contents
- [Install](#install)
- [Quick Start with CLI](#quickstart-with-cli)
- [Streaming Video Caption with CLI](#streaming-video-caption-with-cli)
- [Demo](#gradio-demo)
- [Train](#train)
- [Evaluate](#evaluate)
- [Model Zoo](#model-zoo)
- [Citation](#citation)
- [Acknowledgement](#acknowledgement)

## Install

1. Clone this repository and navigate to the VideoLLaMB folder
```bash
git clone https://github.com/bigai-nlco/VideoLLaMB.git
cd VideoLLaMB
```

2. Install the package
```bash
conda create -n videollamb python=3.10 -y
conda activate videollamb
pip install --upgrade pip
pip install -e .
conda install ffmpeg
```

3. Install additional packages for training
```bash
pip install -e ".[train]"
pip install flash-attn --no-build-isolation
# if the install above fails, retry without the pip cache:
pip install flash-attn --no-build-isolation --no-cache-dir
```

## QuickStart With CLI

Download the checkpoint, place it in the `checkpoints` directory, then run the following command:
```bash
python -m llava.serve.cli --model-path checkpoints/videollamb-llava-1.5-7b --video-file XXX.mp4
```
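If you prefer to fetch the checkpoint programmatically rather than downloading it by hand, a minimal sketch using `huggingface_hub` is shown below. The repo id is taken from the Model Zoo table later in this README, and the target directory matches the command above; adjust both if your layout differs.

```python
from huggingface_hub import snapshot_download

# Repo id from the Model Zoo section; change local_dir if you keep checkpoints elsewhere.
snapshot_download(
    repo_id="ColorfulAI/VideoLLaMB",
    local_dir="checkpoints/videollamb-llava-1.5-7b",
)
```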
## Streaming Video Caption with CLI

Download the checkpoint, place it in the `checkpoints` directory, then run the following command:
```bash
python -m llava.serve.cli_streaming --model_path checkpoints/videollamb-llava-1.5-7b
```

https://github.com/user-attachments/assets/96c32452-f910-4c6c-9feb-0e98134d45a1

## Gradio Demo

Download the checkpoint, place it in the `checkpoints` directory, then run the following command:
```bash
python -m llava.serve.gradio_demo
```

https://github.com/user-attachments/assets/2ea521e5-4bf2-415c-b20d-f5663c93af57

## Train

1. Prepare data

We combine the video instructions from [PLLaVA](https://github.com/magic-research/PLLaVA) and the image instructions from [LLaVA](https://github.com/haotian-liu/LLaVA) for training. Please check [DATA](playground/DATA.md) for details.

2. Prepare model weights for initialization

Our model is initialized from LLaVA: download [llava-v1.5-7b](https://huggingface.co/liuhaotian/llava-v1.5-7b) and put it in `checkpoints/llava-v1.5-7b`. For the visual encoders, we use LanguageBind: download [LanguageBind_Image](https://huggingface.co/LanguageBind/LanguageBind_Image) and [LanguageBind_Video_merge](https://huggingface.co/LanguageBind/LanguageBind_Video_merge), and put them in `checkpoints/LanguageBind_Image` and `checkpoints/LanguageBind_Video_merge`.

3. Start training

Training takes 23 hours for LLaVA-1.5-7B on 4 A800 80GB GPUs.

```bash
bash scripts/finetune_video_image.slurm   # bash
sbatch scripts/finetune_video_image.slurm # slurm cluster
```

We also provide a script that backpropagates the LLM loss to the bridge at each recurrent iteration.

```bash
bash scripts/finetune_video_image_loss.slurm   # bash
sbatch scripts/finetune_video_image_loss.slurm # slurm cluster
```

## Evaluate

1. Prepare data

We provide evaluation pipelines for [EgoSchema](https://egoschema.github.io/), [NExT-QA](https://github.com/doc-doc/NExT-QA), [EgoPlan](https://github.com/ChenYi99/EgoPlan?tab=readme-ov-file#egoplan-evaluation-data), and [MVBench](https://huggingface.co/datasets/OpenGVLab/MVBench). Please check [DATA](playground/DATA.md) for details.

2. Start evaluating

a. Traditional benchmarks

```bash
bash scripts/eval/egoschema.sh # egoschema
bash scripts/eval/nextqa.sh    # nextqa
bash scripts/eval/egoplan.sh   # egoplan
bash scripts/eval/mvbench.sh   # mvbench
```

b. MM-NIAVH

Check out our benchmark [Needle In A Video Haystack (NIAVH)](https://github.com/bigai-nlco/MM-NIAVH).

## Model Zoo

| Model | Base Model | Training Data | Download Link |
| ------------- | ------------- | -------- | ------------- |
| VideoLLaMB-7B | llava-v1.5-7b | [magic_json](https://huggingface.co/datasets/cathyxl/magic_jsons), [LLaVA](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) | [🤗videollamb-llava-1.5-7b](https://huggingface.co/ColorfulAI/VideoLLaMB) |
| VideoLLaMB-7B-Mem (MM-NIAVH) | llava-v1.5-7b | [magic_json](https://huggingface.co/datasets/cathyxl/magic_jsons), [LLaVA](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) | [🤗videollamb-mem-llava-1.5-7b](https://huggingface.co/ColorfulAI/VideoLLaMB_Mem) |
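The checkpoints above can also be driven from Python instead of the interactive CLI. The sketch below shows one plausible wiring, assuming this fork keeps the upstream LLaVA `load_pretrained_model` and `tokenizer_image_token` interfaces (both modules exist under `llava/`); the placeholder video tensor and the `generate` arguments are assumptions that should be checked against `llava/serve/cli.py`.

```python
import torch

from llava.constants import DEFAULT_VIDEO_TOKEN, VIDEO_TOKEN_INDEX
from llava.mm_utils import get_model_name_from_path, tokenizer_image_token
from llava.model.builder import load_pretrained_model

model_path = "checkpoints/videollamb-llava-1.5-7b"

# Assumed to return (tokenizer, model, processor, context_len) as in upstream LLaVA.
tokenizer, model, processor, _ = load_pretrained_model(
    model_path, None, get_model_name_from_path(model_path)
)

# Build a prompt that starts with the special video token, as the CLI does.
prompt = DEFAULT_VIDEO_TOKEN + "\nDescribe what happens in this video."
input_ids = (
    tokenizer_image_token(prompt, tokenizer, VIDEO_TOKEN_INDEX, return_tensors="pt")
    .unsqueeze(0)
    .to(model.device)
)

# Placeholder video tensor; real code should sample and preprocess frames the way
# llava/serve/cli.py and llava/vid_utils.py do.
video = torch.zeros(8, 3, 224, 224, device=model.device, dtype=model.dtype)

with torch.inference_mode():
    output_ids = model.generate(input_ids, images=[video], max_new_tokens=256)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```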
## Acknowledgement

Model:
- [LLaVA](https://github.com/haotian-liu/LLaVA)
- [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA)
- [recurrent-memory-transformer](https://github.com/booydar/recurrent-memory-transformer)

Data:
- [PLLaVA](https://github.com/magic-research/PLLaVA)

Demo:
- [videollm-online](https://github.com/showlab/videollm-online)

## Citation

```bibtex
@misc{mm-niavh,
  title={MLLM Pressure Test: Needle In A Video Haystack},
  author={Wang, Yuxuan and Xie, Cihang and Liu, Yang and Zheng, Zilong},
  publisher={GitHub},
  url={https://github.com/bigai-nlco/NeedleInAVideoHaystack},
  year={2024}
}

@article{videollamb,
  title={VideoLLaMB: Long Video Understanding with Recurrent Memory Bridges},
  author={Wang, Yuxuan and Xie, Cihang and Liu, Yang and Zheng, Zilong},
  journal={arXiv preprint arXiv:2409.01071},
  year={2024}
}
```

--------------------------------------------------------------------------------
/assets/cli.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/cli.gif
--------------------------------------------------------------------------------
/assets/niavh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/niavh.png
--------------------------------------------------------------------------------
/assets/profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/profile.png
--------------------------------------------------------------------------------
/assets/videollamb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/videollamb.png
--------------------------------------------------------------------------------
/llava/__init__.py:
--------------------------------------------------------------------------------
from .model import LlavaLlamaForCausalLM
--------------------------------------------------------------------------------
/llava/constants.py:
--------------------------------------------------------------------------------
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100

# image
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

# video
VIDEO_TOKEN_INDEX = -201
DEFAULT_VIDEO_TOKEN = "