├── .gitignore ├── README.md ├── assets ├── cli.gif ├── niavh.png ├── profile.png └── videollamb.png ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_videoqa.py │ ├── eval_videoqa_consistency.py │ ├── eval_videoqa_context.py │ ├── eval_videoqa_correctness.py │ ├── eval_videoqa_detailed_orientation.py │ ├── eval_videoqa_moviechat.py │ ├── eval_videoqa_temporal.py │ ├── model_videoqa.py │ ├── model_videoqa_act.py │ ├── model_videoqa_consistency.py │ ├── model_videoqa_general.py │ ├── model_videoqa_mc.py │ ├── model_videoqa_moviechat.py │ ├── model_videoqa_moviechat_ws.py │ └── model_videoqa_mvbench.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_llama_retro.py │ │ ├── llava_llama_retro_lm.py │ │ ├── llava_mistral.py │ │ └── llava_mpt.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ ├── clip_vid_encoder.py │ │ ├── deform_clip │ │ │ ├── __init__.py │ │ │ ├── deform_clip_configuration.py │ │ │ └── deform_clip_modeling.py │ │ ├── deformer │ │ │ ├── __init__.py │ │ │ └── deformer_modeling.py │ │ ├── egovlp │ │ │ ├── EgoNCE_MLM_ITM_Config.yml │ │ │ ├── __init__.py │ │ │ ├── model │ │ │ │ ├── base.py │ │ │ │ ├── heads.py │ │ │ │ ├── metric.py │ │ │ │ ├── model.py │ │ │ │ ├── roberta.py │ │ │ │ ├── util.py │ │ │ │ └── video_transformer.py │ │ │ ├── mq.json │ │ │ ├── parse_config.py │ │ │ └── processor.py │ │ ├── languagebind │ │ │ ├── __init__.py │ │ │ ├── audio │ │ │ │ ├── configuration_audio.py │ │ │ │ ├── modeling_audio.py │ │ │ │ ├── processing_audio.py │ │ │ │ └── tokenization_audio.py │ │ │ ├── depth │ │ │ │ ├── configuration_depth.py │ │ │ │ ├── modeling_depth.py │ │ │ │ ├── processing_depth.py │ │ │ │ └── tokenization_depth.py │ │ │ ├── image │ │ │ │ ├── configuration_image.py │ │ │ │ ├── modeling_image.py │ │ │ │ ├── processing_image.py │ │ │ │ └── tokenization_image.py │ │ │ ├── rmt_video │ │ │ │ ├── configuration_video.py │ │ │ │ ├── modeling_video.py │ │ │ │ ├── processing_video.py │ │ │ │ └── tokenization_video.py │ │ │ ├── thermal │ │ │ │ ├── configuration_thermal.py │ │ │ │ ├── modeling_thermal.py │ │ │ │ ├── processing_thermal.py │ │ │ │ └── tokenization_thermal.py │ │ │ └── video │ │ │ │ ├── configuration_video.py │ │ │ │ ├── modeling_video.py │ │ │ │ ├── processing_video.py │ │ │ │ └── tokenization_video.py │ │ ├── mae_encoder.py │ │ ├── rmt_clip │ │ │ ├── __init__.py │ │ │ └── rmt_clip_modeling.py │ │ ├── rmt_vivit │ │ │ ├── __init__.py │ │ │ └── modeling_rmt_vivit.py │ │ ├── videomae_encoder.py │ │ ├── vit_encoder.py │ │ └── vivit_encoder.py │ ├── multimodal_projector │ │ ├── builder.py │ │ ├── identity_projector.py │ │ ├── mlp_projector.py │ │ ├── mlp_transformer_projector.py │ │ ├── qformer_projector.py │ │ ├── retent_transformer_projector.py │ │ ├── rmt_r_transformer_cap_projector.py │ │ ├── rmt_r_transformer_projector.py │ │ ├── rmt_transformer_projector.py │ │ ├── self_retriever.py │ │ ├── self_segment.py │ │ ├── spatial_pool_projector.py │ │ └── transformer_projector.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── arguments_live.py │ ├── cli.py │ ├── cli_streaming.py │ ├── examples │ │ ├── desert.jpg │ │ ├── extreme_ironing.jpg │ │ ├── sample_demo_1.mp4 │ │ ├── sample_demo_13.mp4 │ │ ├── sample_demo_22.mp4 │ │ ├── sample_demo_3.mp4 │ │ ├── sample_demo_8.mp4 │ │ ├── sample_demo_9.mp4 │ │ ├── sample_img_13.png │ │ ├── sample_img_22.png │ │ ├── sample_img_8.png 
│ │ ├── videos │ │ │ ├── cache │ │ │ │ └── llava │ │ │ │ │ └── serve │ │ │ │ │ └── examples │ │ │ │ │ └── videos │ │ │ │ │ └── dance_1fps.mp4 │ │ │ ├── dance.json │ │ │ └── dance.mp4 │ │ └── waterview.jpg │ ├── gradio_demo.py │ ├── gradio_utils.py │ └── inference.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py ├── utils.py └── vid_utils.py ├── playground ├── DATA.md └── eval │ └── GPT_Zero_Shot_QA │ ├── EgoPlan_Zero_Shot_QA │ ├── test_a.json │ └── test_q.json │ ├── EgoSchema_Zero_Shot_QA │ ├── test_a.json │ └── test_q.json │ ├── MVBench_Zero_Shot_QA │ ├── test_a.json │ └── test_q.json │ └── NExT_Zero_Shot_QA │ ├── test_a.json │ └── test_q.json ├── pyproject.toml └── scripts ├── eval ├── egoplan.sh ├── egoschema.sh ├── mvbench.sh └── nextqa.sh ├── eval_acc.py ├── eval_moviechat.py ├── eval_nextqa.py ├── extract_mm_projector.py ├── finetune_video_image.slurm ├── finetune_video_image_loss.slurm ├── gather_moviechat.py ├── merge_lora_weights.py ├── zero1.json ├── zero2.json ├── zero2_offload.json ├── zero3.json └── zero3_offload.json /.gitignore: -------------------------------------------------------------------------------- 1 | # this proj 2 | playground/data 3 | checkpoints/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 114 | .pdm.toml 115 | .pdm-python 116 | .pdm-build/ 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 166 | #.idea/ 167 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |


# VideoLLaMB: Long Video Understanding with Recurrent Memory Bridges

[![videollamb-page](https://img.shields.io/badge/videollamb-page-blue)](https://videollamb.github.io/)
[![arXiv](https://img.shields.io/badge/arXiv-2409.01071-b31b1b.svg)](https://arxiv.org/abs/2409.01071)

Recent advances in large-scale video-language models demonstrate remarkable capabilities in real-time planning and interaction with real-world environments, yet their training is constrained by high computational costs and limited annotated data. Traditional methods, such as video compression and sliding-window techniques, often sacrifice critical visual information or disrupt semantic flow. In addition, current predesigned QA benchmarks fail to adequately assess long video understanding because of inherent biases toward static image features and the base LLM. To address these issues, we introduce VideoLLaMB, a framework that uses Memory Bridge Layers with recurrent memory tokens to encode entire videos without discarding vital information. We also propose the SceneTilling algorithm, which splits a video into semantic units to preserve its semantic flow. Finally, we present the "Needle in a Video Haystack" (NIAVH) benchmark to comprehensively evaluate long video understanding over needles of different modalities.
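To make the memory mechanism above concrete, the simplified PyTorch sketch below shows how a recurrent memory bridge of this kind can work: the video is split into semantic segments, each segment is encoded jointly with a small set of memory tokens, and the updated memory tokens are carried into the next segment. This is an illustrative sketch of the idea only; the layer sizes, names, and attention layout are assumptions rather than the actual code, which lives under `llava/model/multimodal_projector/` (for example `rmt_r_transformer_projector.py`).

```python
import torch
import torch.nn as nn


class MemoryBridgeSketch(nn.Module):
    """Illustrative recurrent memory bridge; NOT the repository's implementation."""

    def __init__(self, dim=1024, num_memory_tokens=32, num_heads=8):
        super().__init__()
        # Learnable memory tokens that persist across video segments.
        self.memory = nn.Parameter(torch.randn(num_memory_tokens, dim) * 0.02)
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=num_heads, batch_first=True)
        self.bridge = nn.TransformerEncoder(layer, num_layers=2)

    def forward(self, segments):
        """segments: list of (tokens_per_segment, dim) tensors, one per semantic unit."""
        mem = self.memory.unsqueeze(0)                     # (1, M, dim)
        num_mem = mem.size(1)
        outputs = []
        for seg in segments:
            x = torch.cat([mem, seg.unsqueeze(0)], dim=1)  # prepend memory to segment tokens
            x = self.bridge(x)                             # joint self-attention over memory + segment
            mem = x[:, :num_mem]                           # recurrently updated memory tokens
            outputs.append(x[:, num_mem:])                 # segment tokens, now memory-conditioned
        # The concatenated tokens are what a projector would hand to the LLM.
        return torch.cat(outputs, dim=1), mem


# Tiny usage example with two fake segments of visual features.
bridge = MemoryBridgeSketch(dim=1024)
segments = [torch.randn(196, 1024), torch.randn(196, 1024)]
tokens, memory = bridge(segments)
```

In VideoLLaMB, SceneTilling decides where the segment boundaries fall, so the memory is updated once per semantic unit rather than per fixed-length window.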
Table of Contents
- [Install](#install)
- [Quick Start with CLI](#quickstart-with-cli)
- [Streaming Video Caption with CLI](#streaming-video-caption-with-cli)
- [Demo](#gradio-demo)
- [Train](#train)
- [Evaluate](#evaluate)
- [Model Zoo](#model-zoo)
- [Citation](#citation)
- [Acknowledgement](#acknowledgement)

## Install

1. Clone this repository and navigate to the VideoLLaMB folder
```bash
git clone https://github.com/bigai-nlco/VideoLLaMB.git
cd VideoLLaMB
```

2. Install the package
```bash
conda create -n videollamb python=3.10 -y
conda activate videollamb
pip install --upgrade pip
pip install -e .
conda install ffmpeg
```

3. Install additional packages for training
```bash
pip install -e ".[train]"
pip install flash-attn --no-build-isolation
# if the install above fails, retry without the pip cache:
pip install flash-attn --no-build-isolation --no-cache-dir
```

## QuickStart With CLI

Download the checkpoint, place it in the `checkpoints` directory, then run the following command:
```bash
python -m llava.serve.cli --model-path checkpoints/videollamb-llava-1.5-7b --video-file XXX.mp4
```
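If you prefer to fetch the checkpoint programmatically rather than downloading it by hand, a minimal sketch using `huggingface_hub` is shown below. The repo id is taken from the Model Zoo table later in this README, and the target directory matches the command above; adjust both if your layout differs.

```python
from huggingface_hub import snapshot_download

# Repo id from the Model Zoo section; change local_dir if you keep checkpoints elsewhere.
snapshot_download(
    repo_id="ColorfulAI/VideoLLaMB",
    local_dir="checkpoints/videollamb-llava-1.5-7b",
)
```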
## Streaming Video Caption with CLI

Download the checkpoint, place it in the `checkpoints` directory, then run the following command:
```bash
python -m llava.serve.cli_streaming --model_path checkpoints/videollamb-llava-1.5-7b
```

https://github.com/user-attachments/assets/96c32452-f910-4c6c-9feb-0e98134d45a1

## Gradio Demo

Download the checkpoint, place it in the `checkpoints` directory, then run the following command:
```bash
python -m llava.serve.gradio_demo
```

https://github.com/user-attachments/assets/2ea521e5-4bf2-415c-b20d-f5663c93af57

## Train

1. Prepare data

We combine the video instructions from [PLLaVA](https://github.com/magic-research/PLLaVA) and the image instructions from [LLaVA](https://github.com/haotian-liu/LLaVA) for training. Please check [DATA](playground/DATA.md) for details.

2. Prepare model weights for initialization

Our model is initialized from LLaVA: download [llava-v1.5-7b](https://huggingface.co/liuhaotian/llava-v1.5-7b) and put it in `checkpoints/llava-v1.5-7b`. For the visual encoders, we use LanguageBind: download [LanguageBind_Image](https://huggingface.co/LanguageBind/LanguageBind_Image) and [LanguageBind_Video_merge](https://huggingface.co/LanguageBind/LanguageBind_Video_merge), and put them in `checkpoints/LanguageBind_Image` and `checkpoints/LanguageBind_Video_merge`.

3. Start training

Training takes 23 hours for LLaVA-1.5-7B on 4 A800 80GB GPUs.

```bash
bash scripts/finetune_video_image.slurm   # bash
sbatch scripts/finetune_video_image.slurm # slurm cluster
```

We also provide a script that backpropagates the LLM loss to the bridge at each recurrent iteration.

```bash
bash scripts/finetune_video_image_loss.slurm   # bash
sbatch scripts/finetune_video_image_loss.slurm # slurm cluster
```

## Evaluate

1. Prepare data

We provide evaluation pipelines for [EgoSchema](https://egoschema.github.io/), [NExT-QA](https://github.com/doc-doc/NExT-QA), [EgoPlan](https://github.com/ChenYi99/EgoPlan?tab=readme-ov-file#egoplan-evaluation-data), and [MVBench](https://huggingface.co/datasets/OpenGVLab/MVBench). Please check [DATA](playground/DATA.md) for details.

2. Start evaluating

a. Traditional benchmarks

```bash
bash scripts/eval/egoschema.sh # egoschema
bash scripts/eval/nextqa.sh    # nextqa
bash scripts/eval/egoplan.sh   # egoplan
bash scripts/eval/mvbench.sh   # mvbench
```

b. MM-NIAVH

Check out our benchmark [Needle In A Video Haystack (NIAVH)](https://github.com/bigai-nlco/MM-NIAVH).

## Model Zoo

| Model | Base Model | Training Data | Download Link |
| ------------- | ------------- | -------- | ------------- |
| VideoLLaMB-7B | llava-v1.5-7b | [magic_json](https://huggingface.co/datasets/cathyxl/magic_jsons), [LLaVA](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) | [🤗videollamb-llava-1.5-7b](https://huggingface.co/ColorfulAI/VideoLLaMB) |
| VideoLLaMB-7B-Mem (MM-NIAVH) | llava-v1.5-7b | [magic_json](https://huggingface.co/datasets/cathyxl/magic_jsons), [LLaVA](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) | [🤗videollamb-mem-llava-1.5-7b](https://huggingface.co/ColorfulAI/VideoLLaMB_Mem) |
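The checkpoints above can also be driven from Python instead of the interactive CLI. The sketch below shows one plausible wiring, assuming this fork keeps the upstream LLaVA `load_pretrained_model` and `tokenizer_image_token` interfaces (both modules exist under `llava/`); the placeholder video tensor and the `generate` arguments are assumptions that should be checked against `llava/serve/cli.py`.

```python
import torch

from llava.constants import DEFAULT_VIDEO_TOKEN, VIDEO_TOKEN_INDEX
from llava.mm_utils import get_model_name_from_path, tokenizer_image_token
from llava.model.builder import load_pretrained_model

model_path = "checkpoints/videollamb-llava-1.5-7b"

# Assumed to return (tokenizer, model, processor, context_len) as in upstream LLaVA.
tokenizer, model, processor, _ = load_pretrained_model(
    model_path, None, get_model_name_from_path(model_path)
)

# Build a prompt that starts with the special video token, as the CLI does.
prompt = DEFAULT_VIDEO_TOKEN + "\nDescribe what happens in this video."
input_ids = (
    tokenizer_image_token(prompt, tokenizer, VIDEO_TOKEN_INDEX, return_tensors="pt")
    .unsqueeze(0)
    .to(model.device)
)

# Placeholder video tensor; real code should sample and preprocess frames the way
# llava/serve/cli.py and llava/vid_utils.py do.
video = torch.zeros(8, 3, 224, 224, device=model.device, dtype=model.dtype)

with torch.inference_mode():
    output_ids = model.generate(input_ids, images=[video], max_new_tokens=256)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```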
## Acknowledgement

Model:
- [LLaVA](https://github.com/haotian-liu/LLaVA)
- [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA)
- [recurrent-memory-transformer](https://github.com/booydar/recurrent-memory-transformer)

Data:
- [PLLaVA](https://github.com/magic-research/PLLaVA)

Demo:
- [videollm-online](https://github.com/showlab/videollm-online)

## Citation

```bibtex
@misc{mm-niavh,
  title={MLLM Pressure Test: Needle In A Video Haystack},
  author={Wang, Yuxuan and Xie, Cihang and Liu, Yang and Zheng, Zilong},
  publisher={GitHub},
  url={https://github.com/bigai-nlco/NeedleInAVideoHaystack},
  year={2024}
}

@article{videollamb,
  title={VideoLLaMB: Long Video Understanding with Recurrent Memory Bridges},
  author={Wang, Yuxuan and Xie, Cihang and Liu, Yang and Zheng, Zilong},
  journal={arXiv preprint arXiv:2409.01071},
  year={2024}
}
```

--------------------------------------------------------------------------------
/assets/cli.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/cli.gif
--------------------------------------------------------------------------------
/assets/niavh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/niavh.png
--------------------------------------------------------------------------------
/assets/profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/profile.png
--------------------------------------------------------------------------------
/assets/videollamb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigai-nlco/VideoLLaMB/962837c5b310559de18b375eaee20561123bb54c/assets/videollamb.png
--------------------------------------------------------------------------------
/llava/__init__.py:
--------------------------------------------------------------------------------
from .model import LlavaLlamaForCausalLM
--------------------------------------------------------------------------------
/llava/constants.py:
--------------------------------------------------------------------------------
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100

# image
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

# video
VIDEO_TOKEN_INDEX = -201
DEFAULT_VIDEO_TOKEN = "