├── README.md ├── assets ├── Algorithm.png ├── compare.png ├── ex1.png ├── ex2.jpg ├── framework.png ├── hd.png ├── high-reso.jpg ├── projector_comparsion.jpg ├── title.png ├── vis-1.jpg └── vis-2.jpg ├── docs └── evaluation.md ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_docvqa.py │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_ocr_bench.py │ ├── eval_pope.py │ ├── eval_textvqa.py │ ├── m4c_evaluator.py │ ├── mmmu │ │ └── eval │ │ │ ├── README.md │ │ │ ├── answer_dict_val.json │ │ │ ├── configs │ │ │ └── llava1.5.yaml │ │ │ ├── convert_to_test.py │ │ │ ├── eval.py │ │ │ ├── main_eval_only.py │ │ │ ├── main_parse_and_eval.py │ │ │ ├── print_results.py │ │ │ ├── run_llava.py │ │ │ └── utils │ │ │ ├── __pycache__ │ │ │ ├── data_utils.cpython-310.pyc │ │ │ ├── eval_utils.cpython-310.pyc │ │ │ └── model_utils_ind.cpython-310.pyc │ │ │ ├── data_utils.py │ │ │ ├── eval_utils.py │ │ │ ├── model_utils.py │ │ │ └── model_utils_ind.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_loader_pope.py │ ├── model_vqa_mmbench.py │ ├── run_llava.py │ └── summarize_gpt_review.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_mpt.py │ │ └── mpt │ │ │ ├── __pycache__ │ │ │ ├── adapt_tokenizer.cpython-310.pyc │ │ │ ├── attention.cpython-310.pyc │ │ │ ├── blocks.cpython-310.pyc │ │ │ ├── configuration_mpt.cpython-310.pyc │ │ │ ├── custom_embedding.cpython-310.pyc │ │ │ ├── flash_attn_triton.cpython-310.pyc │ │ │ ├── hf_prefixlm_converter.cpython-310.pyc │ │ │ ├── meta_init_context.cpython-310.pyc │ │ │ ├── modeling_mpt.cpython-310.pyc │ │ │ ├── norm.cpython-310.pyc │ │ │ └── param_init_fns.cpython-310.pyc │ │ │ ├── adapt_tokenizer.py │ │ │ ├── attention.py │ │ │ ├── blocks.py │ │ │ ├── configuration_mpt.py │ │ │ ├── custom_embedding.py │ │ │ ├── flash_attn_triton.py │ │ │ ├── hf_prefixlm_converter.py │ │ │ ├── meta_init_context.py │ │ │ ├── modeling_mpt.py │ │ │ ├── norm.py │ │ │ └── param_init_fns.py │ ├── llava_arch.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py ├── patch_divide.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ └── train_mem.py └── utils.py ├── pyproject.toml └── scripts ├── convert_docvqa_for_eval.py ├── convert_gqa_for_eval.py ├── convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── extract_mm_projector.py ├── finetune.sh ├── finetune_full_schedule.sh ├── finetune_lora.sh ├── finetune_qlora.sh ├── merge_lora_weights.py ├── pretrain.sh ├── v1_5 ├── eval │ ├── docvqa.sh │ ├── gqa.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu_val.sh │ ├── mmvet.sh │ ├── ocr_bench.sh │ ├── pope.sh │ ├── textvqa.sh │ ├── vizwiz.sh │ └── vqav2.sh ├── finetune.sh ├── finetune_hd.sh ├── pretrain.sh └── pretrain_hd.sh ├── zero2.json ├── zero3.json └── zero3_offload.json /README.md: -------------------------------------------------------------------------------- 1 | 2 |

3 | 4 |

5 | 6 | 7 |
8 | 9 | TokenPacker-v1 10 | 11 | 12 | arXiv 13 | 14 | 15 | HF Model 16 | 17 | 18 | ZhiHu 19 | 20 |
21 | 22 | 23 | --- 24 | 25 | ## Comparisons with existing methods 💡 26 | 27 |

28 | 29 |

## Updates 📌
- [2025/5/23] TokenPacker is accepted by **IJCV** 🎉🎉🎉.
- [2024/10/22] We integrated the TokenPacker-HD framework with [Osprey](https://github.com/CircleRadon/Osprey) to achieve fine-grained, high-resolution pixel-level understanding with large performance gains. Please see the code in this [branch](https://github.com/CircleRadon/TokenPacker/tree/tokenpacker-hd-osprey) for reference.
- [2024/7/25] We released the [checkpoints](https://huggingface.co/collections/sunshine-lwt/tokenpacker-66a234618f0d2327e0cf2cb1), please check them.
- [2024/7/3] We released the [paper](https://arxiv.org/abs/2407.02392) of our TokenPacker on arXiv.
- [2024/7/3] We released the training and inference code.


## What is TokenPacker 👀
TokenPacker is a novel visual projector that adopts a `coarse-to-fine` scheme to inject enriched characteristics into the condensed visual tokens. With TokenPacker, we can compress the visual tokens by **75%∼89%** while achieving comparable or even better performance across diverse benchmarks with significantly higher efficiency.


#### Algorithms
We provide pseudo-code to showcase the detailed processing flow.


#### Core code
As a visual projector, TokenPacker is implemented as the class `TokenPacker`, which can be found in [multimodal_projector/builder.py](./llava/model/multimodal_projector/builder.py#L39).

#### Comparisons with various projectors


## High-Resolution Image Understanding with TokenPacker 🔬
To support efficient `high-resolution` image understanding, we further develop an effective image cropping method, `TokenPacker-HD`.


## Install 🛠️
1. Clone this repository and navigate to the TokenPacker folder
```
git clone https://github.com/CircleRadon/TokenPacker.git
cd TokenPacker
```
2. Install packages
```
conda create -n tokenpacker python=3.10 -y
conda activate tokenpacker
pip install --upgrade pip # enable PEP 660 support
pip install -e .
```
3. Install additional packages for training
```
pip install -e ".[train]"
pip install flash-attn --no-build-isolation
```

## Training 🚀

### LLaVA-TokenPacker

#### Dataset
To make a fair comparison, we use the same training data as in [LLaVA-1.5](https://github.com/haotian-liu/LLaVA), i.e., [LLaVA-Pretrain-558K](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/tree/main) for stage 1 and [Mix665k](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/tree/main) for stage 2.

#### Training
- Stage 1: Image-Text Alignment Pre-training
```shell
bash scripts/v1_5/pretrain.sh
```
- Stage 2: Visual Instruction Tuning
```shell
bash scripts/v1_5/finetune.sh
```
Note: Use `--scale_factor` to control the compression ratio; supported values are [2, 3, 4].

### LLaVA-TokenPacker-HD

#### Dataset
To obtain competitive high-resolution performance, we use the 2.7M samples organized by [Mini-Gemini](https://github.com/dvlab-research/MGM#Dataset), i.e., 1.2M for stage 1 and 1.5M for stage 2.
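#### TokenPacker-HD image cropping (illustrative sketch)
The sketch below condenses the image-cropping step used at inference time in [llava/eval/mmmu/eval/run_llava.py](./llava/eval/mmmu/eval/run_llava.py): the input is resized and zero-padded onto an `h_block x w_block` canvas of 336x336 tiles (the grid is chosen by `Image_Patch.calculate` from `llava/patch_divide.py`), split into local crops, and a single downsampled global view is appended. The helper name `crop_for_hd` is ours for illustration only; treat this as a simplified reference, not a drop-in replacement for the training or evaluation pipelines.

```python
import torch
import torch.nn.functional as F

from llava.patch_divide import Image_Patch  # repo helper that picks the crop grid


def crop_for_hd(image: torch.Tensor, patch_num: int = 9, block_size: int = 336):
    """image: (1, 3, H, W) normalized float tensor; returns (N, 3, 336, 336) crops."""
    image_patch = Image_Patch(patch_num=patch_num)
    h, w = image.shape[-2:]
    # Pick an h_block x w_block grid based on the image size and the --patch_num budget.
    h_block, w_block = image_patch.calculate(h, w)

    # Resize the image so it fits inside the grid canvas, then zero-pad the rest.
    scale = min(block_size * h_block / h, block_size * w_block / w)
    h_ = min(block_size * h_block, round(h * scale))
    w_ = min(block_size * w_block, round(w * scale))
    resized = F.interpolate(image, size=(h_, w_), mode="bilinear")
    canvas = torch.zeros(1, 3, block_size * h_block, block_size * w_block, dtype=resized.dtype)
    canvas[:, :, :h_, :w_] = resized

    # Local crops: one 336x336 tile per grid cell, in row-major order.
    crops = [
        canvas[:, :, i * block_size:(i + 1) * block_size, j * block_size:(j + 1) * block_size]
        for i in range(h_block)
        for j in range(w_block)
    ]

    # Global view: the whole image squeezed into one extra 336x336 tile
    # (only when more than one local crop exists, mirroring run_llava.py).
    if len(crops) > 1:
        g_scale = min(block_size / h, block_size / w)
        g_size = (min(block_size, round(h * g_scale)), min(block_size, round(w * g_scale)))
        g = F.interpolate(image, size=g_size, mode="bilinear")
        global_view = torch.zeros(1, 3, block_size, block_size, dtype=g.dtype)
        global_view[:, :, :g.shape[-2], :g.shape[-1]] = g
        crops.append(global_view)

    return torch.cat(crops, dim=0), h_block, w_block
```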
#### Training
- Stage 1: Image-Text Alignment Pre-training
```shell
bash scripts/v1_5/pretrain_hd.sh
```
- Stage 2: Visual Instruction Tuning
```shell
bash scripts/v1_5/finetune_hd.sh
```

Note:
- Use `--scale_factor` to control the compression ratio; supported values are [2, 3, 4].
- Use `--patch_num` to control the maximum number of image patches; supported values are [9, 16, 25].


## Experiments


## Model Zoo

| Model | Max Res. | Compr. Ratio | Token Num. | Max Patch Num. | Training Data | Download |
|---|:---:|:---:|:---:|:---:|:---:|---|
| TokenPacker-7b | 336x336 | 1/4 | 144 | - | 558K+665K | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-7b-144token/tree/main) |
| TokenPacker-13b | 336x336 | 1/4 | 144 | - | 558K+665K | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-13b-144token/tree/main) |
| TokenPacker-HD-7b | 1088x1088 | 1/4 | ~954 | 9 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-7b-9patch-144token/tree/main) |
| TokenPacker-HD-13b | 1088x1088 | 1/4 | ~954 | 9 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-9patch-144token/tree/main) |
| TokenPacker-HD-13b | 1344x1344 | 1/4 | ~1393 | 16 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-16patch-144token/tree/main) |
| TokenPacker-HD-13b | 1344x1344 | 1/9 | ~619 | 16 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-16patch-64token/tree/main) |
| TokenPacker-HD-13b | 1344x1344 | 1/16 | ~347 | 16 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-16patch-36token/tree/main) |

Note:
- The `token number` of TokenPacker-HD is the `average` computed over all training and test data.
- The `558K+665K` training data follows LLaVA-1.5; the `1.2M+1.5M` data follows Mini-Gemini.
- All models use Vicuna-7B/13B as the base LLM.


## Visualization
We provide some visual examples.

High-resolution image understanding.


## TODO List 📝
- [x] Release the training and inference code.
- [x] Release all checkpoints.


## Acknowledgement 💌
- [LLaVA-v1.5](https://github.com/haotian-liu/LLaVA): the codebase we built upon.
- [Mini-Gemini](https://github.com/dvlab-research/MGM): the organized data we used for training the high-resolution model.

## More ##
For more recent related works, please refer to [Awesome-Token-Compress](https://github.com/daixiangzi/Awesome-Token-Compress).
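## Token counts at a glance
A back-of-the-envelope check on the non-HD token numbers listed in the Model Zoo above, assuming the standard 24x24 = 576 visual tokens produced by the CLIP-ViT-L/14-336 encoder used in LLaVA-1.5 (the HD token numbers are dataset averages and are not reproduced by this arithmetic):

```python
# TokenPacker downsamples each spatial dimension of the 24x24 visual-token grid
# by --scale_factor, so the output count is (24 / scale_factor) ** 2.
for scale_factor in (2, 3, 4):
    tokens = (24 // scale_factor) ** 2
    print(f"scale_factor={scale_factor}: {tokens} tokens (compression ratio 1/{scale_factor ** 2})")
# scale_factor=2: 144 tokens (compression ratio 1/4)
# scale_factor=3: 64 tokens (compression ratio 1/9)
# scale_factor=4: 36 tokens (compression ratio 1/16)
```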
166 | 167 | ## BibTeX 🖊️ 168 | ``` 169 | @misc{TokenPacker, 170 | title={TokenPacker: Efficient Visual Projector for Multimodal LLM}, 171 | author={Wentong Li, Yuqian Yuan, Jian Liu, Dongqi Tang, Song Wang, Jianke Zhu and Lei Zhang}, 172 | year={2024}, 173 | eprint={2407.02392}, 174 | archivePrefix={arXiv}, 175 | primaryClass={cs.CV} 176 | } 177 | ``` 178 | -------------------------------------------------------------------------------- /assets/Algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/Algorithm.png -------------------------------------------------------------------------------- /assets/compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/compare.png -------------------------------------------------------------------------------- /assets/ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/ex1.png -------------------------------------------------------------------------------- /assets/ex2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/ex2.jpg -------------------------------------------------------------------------------- /assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/framework.png -------------------------------------------------------------------------------- /assets/hd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/hd.png -------------------------------------------------------------------------------- /assets/high-reso.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/high-reso.jpg -------------------------------------------------------------------------------- /assets/projector_comparsion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/projector_comparsion.jpg -------------------------------------------------------------------------------- /assets/title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/title.png -------------------------------------------------------------------------------- /assets/vis-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/vis-1.jpg -------------------------------------------------------------------------------- /assets/vis-2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/vis-2.jpg
--------------------------------------------------------------------------------
/docs/evaluation.md:
--------------------------------------------------------------------------------
# Evaluation

## DocVQA
1. Download `test_v1.0.json` to `./playground/data/eval/docvqa/data`.
2. Set `--image-folder` to the path of the [DocVQA](https://rrc.cvc.uab.es/?ch=17&com=downloads) images.
3. Multi-GPU inference.
```
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/docvqa.sh
```
4. Submit the results to the [evaluation server](https://rrc.cvc.uab.es/?ch=17&com=evaluation&task=1): `./playground/data/eval/docvqa/answers/`


## GQA
1. Download the [data](https://cs.stanford.edu/people/dorarad/gqa/download.html) and [evaluation scripts](https://cs.stanford.edu/people/dorarad/gqa/evaluate.html) following the official instructions and put them under `./playground/data/eval/gqa/data`. You may need to modify `eval.py` as [this](https://gist.github.com/haotian-liu/db6eddc2a984b4cbcc8a7f26fd523187) due to the missing assets in the GQA v1.2 release.
2. Multi-GPU inference.
```
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/gqa.sh
```

## MMBench
1. Download [mmbench_dev_20230712.tsv](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put it under `./playground/data/eval/mmbench`.
2. Single-GPU inference.
```
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench.sh
```
3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `./playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712`.


## MME
1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation).
2. Download the images to `MME_Benchmark_release_version`.
3. Put the official `eval_tool` and `MME_Benchmark_release_version` under `./playground/data/eval/MME`.
4. Single-GPU inference and evaluate.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mme.sh
```

## MMMU_val
1. Download the [data](https://huggingface.co/datasets/MMMU/MMMU/tree/main).
2. Set `--data_path` to the path to the MMMU images.
3. Multi-GPU inference.
```
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/mmmu_val.sh
```

## MM-Vet
1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `./playground/data/eval/mmvet`.
2. Single-GPU inference.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh
```
3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official Jupyter notebook.

## OCRBench
1. Download the [data](https://github.com/Yuliang-Liu/MultimodalOCR).
2. Set `--image_folder` to the path to the OCRBench images, and set `--OCRBench_file` to the OCRBench json file.
3. Single-GPU inference.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/ocr_bench.sh
```

## POPE
1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put it under `./playground/data/eval/pope`.
2. Single-GPU inference and evaluate.
66 | ```Shell 67 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/pope.sh 68 | ``` 69 | 70 | ### TextVQA 71 | 1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and extract to `./playground/data/eval/textvqa`. 72 | 2. Download[images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and set `--image-folder` to the path to textvqa images. 73 | 2. Single-GPU inference and evaluate. 74 | ```Shell 75 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/textvqa.sh 76 | ``` 77 | 78 | ## Vizwiz 79 | 1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `./playground/data/eval/vizwiz`. 80 | 2. Single-GPU inference. 81 | ```Shell 82 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/vizwiz.sh 83 | ``` 84 | 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/2185/my-submission): `./playground/data/eval/vizwiz/answers_upload`. 85 | 86 | 87 | ## VQAv2 88 | 1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and set `--image-folder` to the path to `test2015`. 89 | 2. Multi-GPU inference. 90 | ```Shell 91 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/vqav2.sh 92 | ``` 93 | 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `./playground/data/eval/vqav2/answers_upload`. 94 | -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | 86 | if isinstance(inst['caption'], list): 87 | cap_str = '\n'.join(inst['caption']) 88 | else: 89 | cap_str = inst['caption'] 90 | 91 | category = 'llava_bench_' + json.loads(ques_js)['category'] 92 | if category in rule_dict: 93 | rule = rule_dict[category] 94 | else: 95 | assert False, f"Visual QA category not found in rule file: {category}." 
96 | prompt = rule['prompt'] 97 | role = rule['role'] 98 | content = (f'[Context]\n{cap_str}\n\n' 99 | f'[Question]\n{ques["text"]}\n\n' 100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 102 | f'[System]\n{prompt}\n\n') 103 | cur_js = { 104 | 'id': idx+1, 105 | 'question_id': ques['question_id'], 106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 108 | 'category': category 109 | } 110 | if idx >= len(cur_reviews): 111 | review = get_eval(content, args.max_tokens) 112 | scores = parse_score(review) 113 | cur_js['content'] = review 114 | cur_js['tuple'] = scores 115 | review_file.write(json.dumps(cur_js) + '\n') 116 | review_file.flush() 117 | else: 118 | print(f'Skipping {idx} as we already have it.') 119 | idx += 1 120 | print(idx) 121 | review_file.close() 122 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, 
ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /llava/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | 
parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /llava/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation Guidelines 2 | We provide detailed instructions for evaluation. 
3 | To execute our evaluation script, please ensure that the structure of your model outputs is the same as ours. 4 | 5 | We provide two options: 6 | 1. Evaluation only: you can parse the response on your own and simply provide one file with all the final predictions. 7 | 2. Parse and evaluation: you can leave all the responses to us with the output formats shown below. 8 | 9 | ## Evaluation Only 10 | If you want to use your own parsing logic and *only provide the final answer*, you can use `main_eval_only.py`. 11 | 12 | You can provide all the outputs in *one file* in the following format: 13 | 14 | ``` 15 | { 16 | "validation_Accounting_1": "D", # strictly "A", "B", "C", "D" for multi-choice question 17 | "validation_Architecture_and_Engineering_14": "0.0", # any string response for open question. 18 | ... 19 | } 20 | ``` 21 | Then run eval_only with: 22 | ``` 23 | python main_eval_only.py --output_path ./example_outputs/llava1.5_13b/total_val_output.json 24 | ``` 25 | 26 | Please refer to [example output](https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/example_outputs/llava1.5_13b/total_val_output.json) for a detailed prediction file form. 27 | 28 | 29 | ## Parse and Evaluation 30 | You can also provide response and run the `main_parse_and_eval.py` to use our answer parsing processing and evaluation pipeline as follows: 31 | 32 | ### Output folder structure 33 | 34 | ``` 35 | └── model_name 36 | ├── category_name (e.g., Accounting) 37 | │ ├── output.json 38 | └── category_name (e.g., Electronics) 39 | ├── output.json 40 | ... 41 | ``` 42 | 43 | ### Output file 44 | Each `output.json`` has a list of dict containing instances for evaluation (). 45 | ``` 46 | [ 47 | { 48 | "id": "validation_Electronics_28", 49 | "question_type": "multiple-choice", 50 | "answer": "A", # given answer 51 | "all_choices": [ # create using `get_multi_choice_info` in 52 | "A", 53 | "B", 54 | "C", 55 | "D" 56 | ], 57 | "index2ans": { # create using `get_multi_choice_info` in 58 | "A": "75 + 13.3 cos(250t - 57.7°)V", 59 | "B": "75 + 23.3 cos(250t - 57.7°)V", 60 | "C": "45 + 3.3 cos(250t - 57.7°)V", 61 | "D": "95 + 13.3 cos(250t - 57.7°)V" 62 | }, 63 | "response": "B" # model response 64 | }, 65 | { 66 | "id": "validation_Electronics_29", 67 | "question_type": "short-answer", 68 | "answer": "30", # given answer 69 | "response": "36 watts" # model response 70 | }, 71 | ... 72 | ] 73 | ``` 74 | 75 | ### Evaluation 76 | ``` 77 | python main_parse_and_eval.py --path ./example_outputs/llava1.5_13b --subject ALL # all subject 78 | 79 | # OR you can sepecify one subject for the evaluation 80 | 81 | python main_parse_and_eval.py --path ./example_outputs/llava1.5_13b --subject elec # short name for Electronics. use --help for all short names 82 | 83 | ``` 84 | 85 | `main_parse_and_eval.py` will generate `parsed_output.json` and `result.json` in the subfolder under the same category with output.json, respectively. 86 | 87 | ``` 88 | ├── Accounting 89 | │ ├── output.json 90 | │ ├── parsed_output.json 91 | │ └── result.json 92 | └── Electronics 93 | ├── output.json 94 | ├── parsed_output.json 95 | └── result.json 96 | ... 97 | ``` 98 | 99 | ### Print Results 100 | You can print results locally if you want. 
(use `pip install tabulate` if you haven't)
```
python print_results.py --path ./example_outputs/llava1.5_13b
# Results may differ slightly due to the random selection used for failed responses
```


##### Run LLaVA
If you want to reproduce the results of some of the models, please check run_llava.py as an example.

By setting up the environment following the [LLaVA official repo](https://github.com/haotian-liu/LLaVA) and installing the Hugging Face `datasets` package, you can run LLaVA via the following command:

```
CUDA_VISIBLE_DEVICES=0 nohup python run_llava.py \
 --output_path example_outputs/llava1.5_13b_val.json \
 --model_path liuhaotian/llava-v1.5-13b \
 --config_path configs/llava1.5.yaml
```

Then you can evaluate the results via the very first pipeline.
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/configs/llava1.5.yaml:
--------------------------------------------------------------------------------
task_instructions:
- ""
multi_choice_example_format:
- "{}

{}

Answer with the option's letter from the given choices directly."

short_ans_example_format:
- "{}

Answer the question using a single word or phrase."
temperature:
- 0
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/convert_to_test.py:
--------------------------------------------------------------------------------
import os
import json
from argparse import ArgumentParser

from utils.eval_utils import evaluate
from utils.data_utils import save_json


def main():
    parser = ArgumentParser()
    parser.add_argument('--result_file', type=str, default='llava1.5_13b_val.txt',
                        help='name of saved json')
    parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json',
                        help='name of saved json')

    args = parser.parse_args()
    out_samples = [json.loads(line) for line in open(args.result_file)]
    out_json = {}
    for _sample in out_samples:
        _result = _sample['parsed_pred']
        if isinstance(_result, list):
            _result = str(_result[0])
        out_json[_sample['id']] = _result

    save_json(args.output_path, out_json)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/eval.py:
--------------------------------------------------------------------------------
import os
import json
from argparse import ArgumentParser

from utils.eval_utils import evaluate
from utils.data_utils import save_json


def main():
    parser = ArgumentParser()
    parser.add_argument('--result_file', type=str, default='llava1.5_13b_val.txt',
                        help='name of saved json')
    parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json',
                        help='name of saved json')

    args = parser.parse_args()
    out_samples = [json.loads(line) for line in open(args.result_file)]

    judge_dict, metric_dict = evaluate(out_samples)
    metric_dict.update({"num_example": len(out_samples)})
    judge_dict['metric_dict'] = metric_dict
    save_dir = '/'.join(args.output_path.split('/')[:-1])
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_json(args.output_path, judge_dict)
print(metric_dict) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/main_eval_only.py: -------------------------------------------------------------------------------- 1 | """Parse and Evalate""" 2 | import os 3 | import json 4 | 5 | import pdb 6 | from argparse import ArgumentParser 7 | 8 | from utils.data_utils import save_json, CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT 9 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response, calculate_ins_level_acc 10 | 11 | 12 | if __name__ == '__main__': 13 | 14 | parser = ArgumentParser() 15 | parser.add_argument('--output_path', type=str, default="./example_outputs/qwen_vl/total_val_output.json", help="The path to model output file.") 16 | parser.add_argument('--answer_path', type=str, default="./answer_dict_val.json", help="Answer file path.") 17 | args = parser.parse_args() 18 | 19 | output_dict = json.load(open(args.output_path)) 20 | answer_dict = json.load(open(args.answer_path)) 21 | 22 | # group by category 23 | output_dict_w_cat = {} 24 | for data_id, parsed_pred in output_dict.items(): 25 | category = "_".join(data_id.split("_")[1:-1]) 26 | if category not in output_dict_w_cat: 27 | output_dict_w_cat.update({category: {}}) 28 | output_dict_w_cat[category].update({data_id: parsed_pred}) 29 | 30 | # group by category 31 | answer_dict_w_cat = {} 32 | for data_id, parsed_pred in answer_dict.items(): 33 | category = "_".join(data_id.split("_")[1:-1]) 34 | if category not in answer_dict_w_cat: 35 | answer_dict_w_cat.update({category: {}}) 36 | answer_dict_w_cat[category].update({data_id: parsed_pred}) 37 | 38 | evaluation_result = {} 39 | 40 | for category in CAT_SHORT2LONG.values(): 41 | print("Evaluating: {}".format(category)) 42 | # get cat_outputs and cat_answers 43 | try: 44 | cat_outputs = output_dict_w_cat[category] 45 | cat_answers = answer_dict_w_cat[category] 46 | except KeyError: 47 | print("Skipping {} for not found".format(category)) 48 | continue 49 | 50 | exampels_to_eval = [] 51 | for data_id, parsed_pred in cat_outputs.items(): 52 | question_type = cat_answers[data_id]['question_type'] 53 | if question_type != 'multiple-choice': 54 | parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.) 
55 | else: 56 | parsed_pred = parsed_pred 57 | 58 | exampels_to_eval.append({ 59 | "id": data_id, 60 | "question_type": question_type, 61 | "answer": cat_answers[data_id]['ground_truth'], 62 | "parsed_pred": parsed_pred 63 | }) 64 | 65 | judge_dict, metric_dict = evaluate(exampels_to_eval) 66 | metric_dict.update({"num_example": len(exampels_to_eval)}) 67 | 68 | evaluation_result[category] = metric_dict 69 | 70 | printable_results = {} 71 | # pdb.set_trace() 72 | # add domain Subject 73 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): 74 | in_domain_cat_results = {} 75 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT 76 | if cat_name in evaluation_result.keys(): 77 | in_domain_cat_results[cat_name] = evaluation_result[cat_name] 78 | else: 79 | pass 80 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) 81 | in_domain_data_num = sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()]) 82 | printable_results['Overall-' + domain] = {"num": int(in_domain_data_num), 83 | "acc": round(in_domain_ins_acc, 3) 84 | } 85 | # add sub category 86 | for cat_name, cat_results in in_domain_cat_results.items(): 87 | printable_results[cat_name] = {"num": int(cat_results['num_example']), 88 | "acc": round(cat_results['acc'], 3) 89 | } 90 | 91 | # table.append(["-----------------------------", "-----", "----"]) 92 | all_ins_acc = calculate_ins_level_acc(evaluation_result) 93 | printable_results['Overall'] = {"num": sum([cat_results['num_example'] for cat_results in evaluation_result.values()]), 94 | "acc": round(all_ins_acc, 3) 95 | } 96 | 97 | print(printable_results) 98 | 99 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/main_parse_and_eval.py: -------------------------------------------------------------------------------- 1 | """Parse and Evalate""" 2 | import os 3 | import json 4 | from argparse import ArgumentParser 5 | 6 | from utils.data_utils import save_json, CAT_SHORT2LONG 7 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response 8 | 9 | 10 | if __name__ == '__main__': 11 | 12 | parser = ArgumentParser() 13 | parser.add_argument('--path', type=str, default="./example_outputs/llava1.5_13b", help="The path to model output directory.") 14 | parser.add_argument('--subject', nargs='+', 15 | help=f'The name of the mmmu sub-category. 
Availble: {CAT_SHORT2LONG.keys()} or ALL') 16 | 17 | args = parser.parse_args() 18 | if args.subject[0] == 'ALL': 19 | args.subject = CAT_SHORT2LONG.keys() 20 | 21 | ex_output_path = os.path.join(args.path) 22 | 23 | all_results = {} 24 | for cat_short in args.subject: 25 | category = CAT_SHORT2LONG[cat_short] 26 | print("Evaluating: {}".format(category)) 27 | if category not in os.listdir(ex_output_path): 28 | print("Skipping {} for not found".format(category)) 29 | else: 30 | cat_folder_path = os.path.join(ex_output_path, category) 31 | cat_outputs = json.load(open(os.path.join(cat_folder_path, 'output.json'))) 32 | # Evaluation 33 | eval_samples = [] 34 | for cat_output in cat_outputs: 35 | response = cat_output['response'] 36 | if cat_output['question_type'] == 'multiple-choice': 37 | all_choices = cat_output['all_choices'] 38 | index2ans = cat_output['index2ans'] 39 | parsed_pred = parse_multi_choice_response(response, all_choices, index2ans) 40 | eval_samples.append( 41 | { 42 | 'id': cat_output['id'], 43 | 'question_type': cat_output['question_type'], 44 | 'answer': cat_output['answer'], # the content in option, not answer index. 45 | 'response': response, 46 | 'parsed_pred': parsed_pred, 47 | 'index2ans': index2ans, 48 | } 49 | ) 50 | else: # open 51 | parsed_pred = parse_open_response(response) 52 | eval_samples.append( 53 | { 54 | 'id': cat_output['id'], 55 | 'question_type': cat_output['question_type'], 56 | 'answer': cat_output['answer'], 57 | 'response': response, 58 | 'parsed_pred': parsed_pred, 59 | } 60 | ) 61 | 62 | print("Num of valid samples: {}, Expected Num: {}".format(len(eval_samples), len(cat_outputs))) 63 | 64 | judge_dict, metric_dict = evaluate(eval_samples) 65 | metric_dict.update({"num_example": len(eval_samples)}) 66 | for eval_sample in eval_samples: 67 | eval_sample.update({"judge": judge_dict[eval_sample['id']]}) 68 | 69 | save_json(os.path.join(cat_folder_path, 'parsed_output.json'), eval_samples) 70 | save_json(os.path.join(cat_folder_path, 'result.json'), metric_dict) 71 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/print_results.py: -------------------------------------------------------------------------------- 1 | # Beautiful table to print results of all categories 2 | 3 | import os 4 | from typing import Dict 5 | import json 6 | import numpy as np 7 | from tabulate import tabulate 8 | 9 | from argparse import ArgumentParser 10 | 11 | from utils.data_utils import CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT 12 | 13 | from utils.eval_utils import calculate_ins_level_acc 14 | 15 | def main(): 16 | parser = ArgumentParser() 17 | parser.add_argument('--path', type=str, default="./example_outputs/blip2_flant5xxl", help="The path to output directory.") 18 | args = parser.parse_args() 19 | 20 | # load all results 21 | all_results = {} 22 | for cat_folder_name in os.listdir(args.path): 23 | if cat_folder_name in CAT_SHORT2LONG.values(): 24 | cat_folder_path = os.path.join(args.path, cat_folder_name) 25 | result_path = os.path.join(cat_folder_path, 'result.json') 26 | if os.path.exists(result_path): 27 | cat_results = json.load(open(result_path)) 28 | all_results[cat_folder_name] = cat_results 29 | 30 | # print results 31 | headers = ['Subject', 'Data Num', 'Acc'] 32 | table = [] 33 | 34 | # add domain Subject 35 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): 36 | in_domain_cat_results = {} 37 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT 38 | if cat_name in 
all_results.keys(): 39 | in_domain_cat_results[cat_name] = all_results[cat_name] 40 | else: 41 | pass 42 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) 43 | in_domain_data_num = np.sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()]) 44 | table.append(['Overall-' + domain, int(in_domain_data_num), round(in_domain_ins_acc, 3)]) 45 | # add sub category 46 | for cat_name, cat_results in in_domain_cat_results.items(): 47 | table.append([cat_name, int(cat_results['num_example']), round(cat_results['acc'], 3)]) 48 | # table.append(["-----------------------------", "-----", "----"]) 49 | 50 | # table.append(["-----------------------------", "-----", "----"]) 51 | all_ins_acc = calculate_ins_level_acc(all_results) 52 | table.append(['Overall', np.sum([cat_results['num_example'] for cat_results in all_results.values()]), round(all_ins_acc, 3)]) 53 | 54 | print(tabulate(table, headers=headers, tablefmt='orgtbl')) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/run_llava.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import math 7 | from tqdm import tqdm 8 | import json 9 | 10 | from datasets import load_dataset, concatenate_datasets 11 | from argparse import ArgumentParser 12 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 13 | from llava.model import * 14 | from llava.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path 15 | from utils.data_utils import load_yaml, construct_prompt, save_json, process_single_sample, CAT_SHORT2LONG 16 | from utils.model_utils_ind import call_llava_engine_df 17 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response 18 | import torch.nn.functional as F 19 | from functools import partial 20 | from llava.patch_divide import Image_Patch 21 | from torchvision.transforms import Compose, ToTensor, Normalize 22 | 23 | def set_seed(seed_value): 24 | """ 25 | Set the seed for PyTorch (both CPU and CUDA), Python, and NumPy for reproducible results. 26 | 27 | :param seed_value: An integer value to be used as the seed. 28 | """ 29 | torch.manual_seed(seed_value) 30 | if torch.cuda.is_available(): 31 | torch.cuda.manual_seed(seed_value) 32 | torch.cuda.manual_seed_all(seed_value) # For multi-GPU setups 33 | random.seed(seed_value) 34 | np.random.seed(seed_value) 35 | torch.backends.cudnn.deterministic = True 36 | torch.backends.cudnn.benchmark = False 37 | 38 | def split_list(lst, n): 39 | """Split a list into n (roughly) equal-sized chunks""" 40 | chunk_size = math.ceil(len(lst) / n) # integer division 41 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 42 | 43 | 44 | def get_chunk(lst, n, k): 45 | chunks = split_list(lst, n) 46 | return chunks[k] 47 | 48 | 49 | def main(): 50 | parser = ArgumentParser() 51 | # parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json', 52 | # help='name of saved json') 53 | parser.add_argument('--config_path', type=str, default="configs/llava1.5.yaml") 54 | parser.add_argument('--data_path', type=str, default="MMMU/MMMU") # hf dataset path. 
55 | parser.add_argument('--model_path', type=str, default="liuhaotian/llava-v1.5-13b") 56 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 57 | parser.add_argument("--num-chunks", type=int, default=1) 58 | parser.add_argument("--chunk-idx", type=int, default=0) 59 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 60 | parser.add_argument('--split', type=str, default='validation') 61 | parser.add_argument('--seed', type=int, default=42) 62 | parser.add_argument('--load_8bit', type=bool, default=False) 63 | 64 | args = parser.parse_args() 65 | # device = torch.device("cuda") if torch.cuda.is_available() else "cpu" 66 | set_seed(args.seed) 67 | 68 | print('llava_initializing...') 69 | processor = None 70 | call_model_engine = call_llava_engine_df 71 | 72 | # load config and process to one value 73 | args.config = load_yaml(args.config_path) 74 | for key, value in args.config.items(): 75 | if key != 'eval_params' and type(value) == list: 76 | assert len(value) == 1, 'key {} has more than one value'.format(key) 77 | args.config[key] = value[0] 78 | 79 | model_path = os.path.expanduser(args.model_path) 80 | model_name = get_model_name_from_path(model_path) 81 | tokenizer = AutoTokenizer.from_pretrained( 82 | args.model_path, 83 | model_max_length = 2048, 84 | padding_side="right", 85 | use_fast = True 86 | ) 87 | model = LlavaLlamaForCausalLM.from_pretrained( 88 | args.model_path, 89 | torch_dtype=torch.bfloat16, 90 | ).cuda() 91 | 92 | for m in model.modules(): 93 | m.tokenizer = tokenizer 94 | 95 | vision_tower = model.get_vision_tower() 96 | if not vision_tower.is_loaded: 97 | vision_tower.load_model() 98 | vision_tower.to(device='cuda', dtype=torch.float16) 99 | image_processor = vision_tower.image_processor 100 | 101 | patch_num = getattr(model.config, 'patch_num', '9') 102 | image_patch = Image_Patch(patch_num=int(patch_num)) 103 | preprocess = Compose([ToTensor(), Normalize((0.48145466, 0.4578275, 0.40821073),(0.26862954, 0.26130258, 0.27577711))]) 104 | 105 | 106 | # run for each subject 107 | sub_dataset_list = [] 108 | subjects = [x for x in CAT_SHORT2LONG.values()] 109 | ''' 110 | subjects = [ 111 | 'Architecture_and_Engineering', 'Computer_Science', 'Electronics', 112 | 'Energy_and_Power', 'Materials', 'Mechanical_Engineering' 113 | ] 114 | ''' 115 | for subject in tqdm(subjects): 116 | sub_dataset = load_dataset(args.data_path, subject, split=args.split) 117 | sub_dataset_list.append(sub_dataset) 118 | 119 | sub_dataset_list = get_chunk(sub_dataset_list, args.num_chunks, args.chunk_idx) 120 | 121 | # merge all dataset 122 | dataset = concatenate_datasets(sub_dataset_list) 123 | 124 | # samples = [] 125 | out_samples = [] 126 | for sample in tqdm(dataset): 127 | sample = process_single_sample(sample) 128 | 129 | sample = construct_prompt(sample, args.config) 130 | if sample['image']: 131 | image = sample['image'].convert('RGB') 132 | if model.config.image_aspect_ratio == 'slice': 133 | image = preprocess(image) 134 | image = image.unsqueeze(0) 135 | h, w = image.shape[-2:] 136 | block_size = 336 137 | h_block, w_block = image_patch.calculate(h, w) 138 | h_ratio = block_size*h_block/h 139 | w_ratio = block_size*w_block/w 140 | if h_ratio<=w_ratio: 141 | w_ = min(block_size*w_block, round(w*h_ratio)) 142 | h_ = block_size*h_block 143 | else: 144 | w_ = block_size*w_block 145 | h_ = min(block_size*h_block, round(h*w_ratio)) 146 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 147 | image = torch.zeros((1, 3, 
block_size*h_block, block_size*w_block)).to(dtype=image_inter.dtype, device=image_inter.device) 148 | image[:, :, :h_, :w_] = image_inter 149 | 150 | split_images = [] 151 | for i_ in range(h_block): 152 | for j_ in range(w_block): 153 | image_s = image[:,:,block_size*i_:block_size*(i_+1), block_size*j_:block_size*(j_+1)] 154 | split_images.append(image_s) 155 | if len(split_images)>1: 156 | h_ratio = block_size/h 157 | w_ratio = block_size/w 158 | if h_ratio<=w_ratio: 159 | w_ = min(block_size, round(w*h_ratio)) 160 | h_ = block_size 161 | else: 162 | w_ = block_size 163 | h_ = min(block_size, round(h*w_ratio)) 164 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 165 | image_s = torch.zeros((1, 3, block_size, block_size)).to(dtype=image_inter.dtype, device=image_inter.device) 166 | image_s[:, :, :h_, :w_] = image_inter 167 | split_images.append(image_s) 168 | image_tensor = torch.cat(split_images, dim=0) 169 | else: 170 | image_tensor = process_images([image], image_processor, model.config)[0] 171 | image_tensor = image_tensor.unsqueeze(0) 172 | h_block = 1 173 | w_block = 1 174 | 175 | sample['image'] = image_tensor 176 | 177 | # samples.append(sample) 178 | mode = model.config.image_aspect_ratio 179 | with torch.no_grad(): 180 | response = call_model_engine(args, sample, model, tokenizer, processor, h_block, w_block, mode) 181 | if sample['question_type'] == 'multiple-choice': 182 | parsed_pred = parse_multi_choice_response(response, sample['all_choices'], sample['index2ans']) 183 | out_sample = { 184 | 'id': sample['id'], 185 | 'question_type': sample['question_type'], 186 | 'answer': sample['answer'], 187 | 'response': response, 188 | 'parsed_pred': parsed_pred, 189 | 'index2ans': sample['index2ans'], 190 | } 191 | else: # open question 192 | parsed_pred = parse_open_response(response) 193 | out_sample = { 194 | 'id': sample['id'], 195 | 'question_type': sample['question_type'], 196 | 'answer': sample['answer'], 197 | 'response': response, 198 | 'parsed_pred': parsed_pred, 199 | } 200 | out_samples.append(out_sample) 201 | 202 | answers_file = os.path.expanduser(args.answers_file) 203 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 204 | ans_file = open(answers_file, "w") 205 | for i, sample in enumerate(out_samples): 206 | ans_file.write(json.dumps(sample) + "\n") 207 | ans_file.close() 208 | 209 | if __name__ == '__main__': 210 | main() 211 | 212 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/__pycache__/data_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/eval/mmmu/eval/utils/__pycache__/data_utils.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/__pycache__/eval_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/eval/mmmu/eval/utils/__pycache__/eval_utils.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/__pycache__/model_utils_ind.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/eval/mmmu/eval/utils/__pycache__/model_utils_ind.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/data_utils.py: -------------------------------------------------------------------------------- 1 | """Utils for data load, save, and process (e.g., prompt construction)""" 2 | 3 | import os 4 | import json 5 | import yaml 6 | import re 7 | 8 | 9 | DOMAIN_CAT2SUB_CAT = { 10 | 'Art and Design': ['Art', 'Art_Theory', 'Design', 'Music'], 11 | 'Business': ['Accounting', 'Economics', 'Finance', 'Manage','Marketing'], 12 | 'Science': ['Biology', 'Chemistry', 'Geography', 'Math', 'Physics',], 13 | 'Health and Medicine': ['Basic_Medical_Science', 'Clinical_Medicine', 'Diagnostics_and_Laboratory_Medicine', 'Pharmacy', 'Public_Health'], 14 | 'Humanities and Social Science': ['History', 'Literature', 'Sociology', 'Psychology'], 15 | 'Tech and Engineering': ['Agriculture', 'Architecture_and_Engineering', 'Computer_Science', 'Electronics', 'Energy_and_Power', 'Materials', 'Mechanical_Engineering'], 16 | } 17 | 18 | 19 | CAT_SHORT2LONG = { 20 | 'acc': 'Accounting', 21 | 'agri': 'Agriculture', 22 | 'arch': 'Architecture_and_Engineering', 23 | 'art': 'Art', 24 | 'art_theory': 'Art_Theory', 25 | 'bas_med': 'Basic_Medical_Science', 26 | 'bio': 'Biology', 27 | 'chem': 'Chemistry', 28 | 'cli_med': 'Clinical_Medicine', 29 | 'cs': 'Computer_Science', 30 | 'design': 'Design', 31 | 'diag_med': 'Diagnostics_and_Laboratory_Medicine', 32 | 'econ': 'Economics', 33 | 'elec': 'Electronics', 34 | 'ep': 'Energy_and_Power', 35 | 'fin': 'Finance', 36 | 'geo': 'Geography', 37 | 'his': 'History', 38 | 'liter': 'Literature', 39 | 'manage': 'Manage', 40 | 'mark': 'Marketing', 41 | 'mate': 'Materials', 42 | 'math': 'Math', 43 | 'mech': 'Mechanical_Engineering', 44 | 'music': 'Music', 45 | 'phar': 'Pharmacy', 46 | 'phys': 'Physics', 47 | 'psy': 'Psychology', 48 | 'pub_health': 'Public_Health', 49 | 'socio': 'Sociology' 50 | } 51 | 52 | # DATA SAVING 53 | def save_json(filename, ds): 54 | with open(filename, 'w') as f: 55 | json.dump(ds, f, indent=4) 56 | 57 | 58 | def get_multi_choice_info(options): 59 | """ 60 | Given the list of options for multiple choice question 61 | Return the index2ans and all_choices 62 | """ 63 | 64 | start_chr = 'A' 65 | all_choices = [] 66 | index2ans = {} 67 | for i, option in enumerate(options): 68 | index2ans[chr(ord(start_chr) + i)] = option 69 | all_choices.append(chr(ord(start_chr) + i)) 70 | 71 | return index2ans, all_choices 72 | 73 | def load_yaml(file_path): 74 | with open(file_path, 'r') as stream: 75 | try: 76 | yaml_dict = yaml.safe_load(stream) 77 | except yaml.YAMLError as exc: 78 | print(exc) 79 | 80 | return yaml_dict 81 | 82 | 83 | def parse_img_path(text): 84 | matches = re.findall("", text) 85 | return matches 86 | 87 | def process_single_sample(data): 88 | question = data['question'] 89 | o_imgs_paths = [] 90 | for option in data['options']: 91 | current_o_imgs_paths = parse_img_path(option) 92 | for img_path in current_o_imgs_paths: 93 | o_imgs_paths.append(img_path) 94 | 95 | if len(o_imgs_paths) > 1: # multiple images in options, used for random selection 96 | return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'], 97 | 'image': None, 'question_type': data['question_type']} 98 | else: 99 | return {'id': data['id'], 'question': question, 'options': 
data['options'], 'answer': data['answer'], 100 | 'image': data['image_1'], 'question_type': data['question_type']} 101 | 102 | 103 | # DATA SAVING 104 | def save_json(filename, ds): 105 | with open(filename, 'w') as f: 106 | json.dump(ds, f, indent=4) 107 | 108 | def save_jsonl(filename, data): 109 | """ 110 | Save a dictionary of data to a JSON Lines file with the filename as key and caption as value. 111 | 112 | Args: 113 | filename (str): The path to the file where the data should be saved. 114 | data (dict): The dictionary containing the data to save where key is the image path and value is the caption. 115 | """ 116 | with open(filename, 'w', encoding='utf-8') as f: 117 | for img_path, caption in data.items(): 118 | # Extract the base filename without the extension 119 | base_filename = os.path.basename(img_path) 120 | # Create a JSON object with the filename as the key and caption as the value 121 | json_record = json.dumps({base_filename: caption}, ensure_ascii=False) 122 | # Write the JSON object to the file, one per line 123 | f.write(json_record + '\n') 124 | 125 | def save_args(args, path_dir): 126 | argsDict = args.__dict__ 127 | with open(path_dir + 'setting.txt', 'w') as f: 128 | f.writelines('------------------ start ------------------' + '\n') 129 | for eachArg, value in argsDict.items(): 130 | f.writelines(eachArg + ' : ' + str(value) + '\n') 131 | f.writelines('------------------- end -------------------') 132 | 133 | 134 | 135 | # DATA PROCESSING 136 | def construct_prompt(sample, config): 137 | question = sample['question'] 138 | options = eval(sample['options']) 139 | example = "" 140 | if sample['question_type'] == 'multiple-choice': 141 | start_chr = 'A' 142 | prediction_range = [] 143 | index2ans = {} 144 | for option in options: 145 | prediction_range.append(start_chr) 146 | example += f"({start_chr}) {option}\n" 147 | index2ans[start_chr] = option 148 | start_chr = chr(ord(start_chr) + 1) 149 | empty_prompt_sample_structure = config['multi_choice_example_format'] 150 | empty_prompt = empty_prompt_sample_structure.format(question, example) 151 | res_dict = {} 152 | res_dict['index2ans'] = index2ans 153 | res_dict['correct_choice'] = sample['answer'] 154 | res_dict['all_choices'] = prediction_range 155 | res_dict['empty_prompt'] = empty_prompt 156 | if config['task_instructions']: 157 | res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt 158 | else: 159 | res_dict['final_input_prompt'] = empty_prompt 160 | 161 | res_dict['gt_content'] = options[ord(sample['answer'].upper()) - ord('A')] 162 | else: 163 | empty_prompt_sample_structure = config['short_ans_example_format'] 164 | empty_prompt = empty_prompt_sample_structure.format(question) 165 | res_dict = {} 166 | res_dict['empty_prompt'] = empty_prompt 167 | if config['task_instructions']: 168 | res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt 169 | else: 170 | res_dict['final_input_prompt'] = empty_prompt 171 | res_dict['gt_content'] = sample['answer'] 172 | 173 | res_dict.update(sample) 174 | return res_dict -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | import torch 3 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 4 | from llava.conversation import conv_templates, 
SeparatorStyle 5 | 6 | def call_llava_engine_df(args, sample, model, tokenizer=None, processor=None): 7 | 8 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 9 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 10 | 11 | def insert_separator(X, sep): 12 | return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] 13 | 14 | input_ids = [] 15 | offset = 0 16 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 17 | offset = 1 18 | input_ids.append(prompt_chunks[0][0]) 19 | 20 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 21 | input_ids.extend(x[offset:]) 22 | 23 | if return_tensors is not None: 24 | if return_tensors == 'pt': 25 | return torch.tensor(input_ids, dtype=torch.long) 26 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 27 | return input_ids 28 | 29 | def deal_with_prompt(input_text, mm_use_im_start_end): 30 | qs = input_text 31 | if mm_use_im_start_end: 32 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 33 | else: 34 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 35 | return qs 36 | 37 | prompt = sample['final_input_prompt'] 38 | prompt = deal_with_prompt(prompt, model.config.mm_use_im_start_end) 39 | conv = conv_templates['vicuna_v1'].copy() 40 | conv.append_message(conv.roles[0], prompt) 41 | conv.append_message(conv.roles[1], None) 42 | prompt = conv.get_prompt() 43 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 44 | image = sample['image'] 45 | if image is not None: 46 | output_ids = model.generate( 47 | input_ids, 48 | images=image.unsqueeze(0).half().cuda(), 49 | do_sample=True, 50 | temperature=1, 51 | top_p=None, 52 | num_beams=5, 53 | max_new_tokens=128, 54 | use_cache=True) 55 | 56 | # input_token_len = input_ids.shape[1] 57 | # n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 58 | # if n_diff_input_output > 0: 59 | # print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 60 | # response = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 61 | response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 62 | else: # multiple images actually 63 | if sample['question_type'] == 'multiple-choice': 64 | all_choices = sample['all_choices'] 65 | response = random.choice(all_choices) 66 | else: 67 | response = 'INVALID GENERATION FOR MULTIPLE IMAGE INPUTS' 68 | 69 | return response 70 | 71 | 72 | def llava_image_processor(raw_image, vis_processors=None): 73 | image_tensor = vis_processors.preprocess(raw_image, return_tensors='pt')['pixel_values'][0] 74 | return image_tensor 75 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/model_utils_ind.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | import torch 3 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 4 | from llava.conversation import conv_templates, SeparatorStyle 5 | from functools import partial 6 | from llava.mm_utils import tokenizer_image_token 7 | 8 | def call_llava_engine_df(args, sample, model, tokenizer=None, processor=None, h_block=None, w_block=None, mode=None): 9 | 10 | def deal_with_prompt(input_text, 
mm_use_im_start_end, ocr_tokens): 11 | if ocr_tokens is not None: 12 | qs = input_text + '\n' + ocr_tokens 13 | else: 14 | qs = input_text 15 | if mm_use_im_start_end: 16 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 17 | else: 18 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 19 | return qs 20 | 21 | prompt = sample['final_input_prompt'] 22 | ocr_tokens = sample.get('ocr', None) 23 | prompt = deal_with_prompt(prompt, model.config.mm_use_im_start_end, ocr_tokens) 24 | conv = conv_templates[args.conv_mode].copy() 25 | conv.append_message(conv.roles[0], prompt) 26 | conv.append_message(conv.roles[1], None) 27 | prompt = conv.get_prompt() 28 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 29 | image = sample['image'] 30 | 31 | if image is not None: 32 | model.orig_forward = model.forward 33 | model.forward = partial(model.orig_forward, 34 | mode=mode, 35 | h_block = [h_block], 36 | w_block = [w_block] 37 | ) 38 | output_ids = model.generate( 39 | input_ids, 40 | images=image.bfloat16().cuda(), 41 | do_sample=False, 42 | temperature=0, 43 | num_beams=1, 44 | top_p=None, 45 | max_new_tokens=1024, 46 | use_cache=True) 47 | 48 | model.forward = model.orig_forward 49 | 50 | input_token_len = input_ids.shape[1] 51 | 52 | response = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 53 | # response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip('\n') 54 | else: # multiple images actually 55 | if sample['question_type'] == 'multiple-choice': 56 | all_choices = sample['all_choices'] 57 | response = random.choice(all_choices) 58 | else: 59 | response = 'INVALID GENERATION FOR MULTIPLE IMAGE INPUTS' 60 | 61 | return response 62 | -------------------------------------------------------------------------------- /llava/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | # new stopping implementation 14 | class KeywordsStoppingCriteria(StoppingCriteria): 15 | def __init__(self, keywords, tokenizer, input_ids): 16 | self.keywords = keywords 17 | self.tokenizer = tokenizer 18 | self.start_len = None 19 | self.input_ids = input_ids 20 | 21 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 22 | if self.start_len is None: 23 | self.start_len = self.input_ids.shape[1] 24 | else: 25 | outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0] 26 | for keyword in self.keywords: 27 | if keyword in outputs: 28 | return True 29 | return False 30 | 31 | 32 | @torch.inference_mode() 33 | def eval_model(model_name, questions_file, answers_file): 34 | # Model 35 | disable_torch_init() 36 | model_name = os.path.expanduser(model_name) 37 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 38 | model = AutoModelForCausalLM.from_pretrained(model_name, 39 | torch_dtype=torch.float16).cuda() 40 | 41 | 42 | ques_file = open(os.path.expanduser(questions_file), "r") 43 | ans_file = open(os.path.expanduser(answers_file), "w") 44 | for i, line in enumerate(tqdm(ques_file)): 45 | idx = 
json.loads(line)["question_id"] 46 | qs = json.loads(line)["text"] 47 | cat = json.loads(line)["category"] 48 | conv = default_conversation.copy() 49 | conv.append_message(conv.roles[0], qs) 50 | prompt = conv.get_prompt() 51 | inputs = tokenizer([prompt]) 52 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 53 | stopping_criteria = KeywordsStoppingCriteria([conv.sep], tokenizer, input_ids) 54 | output_ids = model.generate( 55 | input_ids, 56 | do_sample=True, 57 | use_cache=True, 58 | temperature=0.7, 59 | max_new_tokens=1024, 60 | stopping_criteria=[stopping_criteria]) 61 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 62 | try: 63 | index = outputs.index(conv.sep, len(prompt)) 64 | except ValueError: 65 | outputs += conv.sep 66 | index = outputs.index(conv.sep, len(prompt)) 67 | 68 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 69 | ans_id = shortuuid.uuid() 70 | ans_file.write(json.dumps({"question_id": idx, 71 | "text": outputs, 72 | "answer_id": ans_id, 73 | "model_id": model_name, 74 | "metadata": {}}) + "\n") 75 | ans_file.flush() 76 | ans_file.close() 77 | 78 | if __name__ == "__main__": 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 81 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 82 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 83 | args = parser.parse_args() 84 | 85 | eval_model(args.model_name, args.question_file, args.answers_file) 86 | -------------------------------------------------------------------------------- /llava/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.utils import disable_torch_init 11 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path, KeywordsStoppingCriteria 12 | 13 | from PIL import Image 14 | import math 15 | import torch.nn.functional as F 16 | from functools import partial 17 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 18 | from llava.model import * 19 | from llava.patch_divide import Image_Patch 20 | from torchvision.transforms import Compose, ToTensor, Normalize 21 | 22 | 23 | def split_list(lst, n): 24 | """Split a list into n (roughly) equal-sized chunks""" 25 | chunk_size = math.ceil(len(lst) / n) # integer division 26 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 27 | 28 | 29 | def get_chunk(lst, n, k): 30 | chunks = split_list(lst, n) 31 | return chunks[k] 32 | 33 | 34 | def eval_model(args): 35 | # Model 36 | disable_torch_init() 37 | model_path = os.path.expanduser(args.model_path) 38 | model_name = get_model_name_from_path(model_path) 39 | tokenizer = AutoTokenizer.from_pretrained( 40 | model_path, 41 | model_max_length = 2048, 42 | padding_side="right", 43 | use_fast = True 44 | ) 45 | 46 | model = LlavaLlamaForCausalLM.from_pretrained( 47 | model_path, 48 | torch_dtype=torch.bfloat16, 49 | ).cuda() 50 | 51 | for m in model.modules(): 52 | m.tokenizer = tokenizer 53 | 54 | vision_tower = model.get_vision_tower() 55 | if not vision_tower.is_loaded: 56 | 
vision_tower.load_model() 57 | vision_tower.to(device='cuda', dtype=torch.float16) 58 | image_processor = vision_tower.image_processor 59 | 60 | patch_num = getattr(model.config, 'patch_num', '9') 61 | image_patch = Image_Patch(patch_num=int(patch_num)) 62 | preprocess = Compose([ToTensor(), Normalize((0.48145466, 0.4578275, 0.40821073),(0.26862954, 0.26130258, 0.27577711))]) 63 | 64 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 65 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 66 | answers_file = os.path.expanduser(args.answers_file) 67 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 68 | ans_file = open(answers_file, "w") 69 | for line in tqdm(questions): 70 | idx = line["question_id"] 71 | image_file = line["image"] 72 | qs = line["text"] 73 | cur_prompt = qs 74 | if model.config.mm_use_im_start_end: 75 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 76 | else: 77 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 78 | 79 | conv = conv_templates[args.conv_mode].copy() 80 | conv.append_message(conv.roles[0], qs) 81 | conv.append_message(conv.roles[1], None) 82 | prompt = conv.get_prompt() 83 | 84 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 85 | 86 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 87 | if model.config.image_aspect_ratio == 'slice': 88 | image = preprocess(image) 89 | image = image.unsqueeze(0) 90 | h, w = image.shape[-2:] 91 | block_size = 336 92 | h_block, w_block = image_patch.calculate(h, w) 93 | h_ratio = block_size*h_block/h 94 | w_ratio = block_size*w_block/w 95 | if h_ratio<=w_ratio: 96 | w_ = min(block_size*w_block, round(w*h_ratio)) 97 | h_ = block_size*h_block 98 | else: 99 | w_ = block_size*w_block 100 | h_ = min(block_size*h_block, round(h*w_ratio)) 101 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 102 | image = torch.zeros((1, 3, block_size*h_block, block_size*w_block)).to(dtype=image_inter.dtype, device=image_inter.device) 103 | image[:, :, :h_, :w_] = image_inter 104 | 105 | split_images = [] 106 | for i_ in range(h_block): 107 | for j_ in range(w_block): 108 | image_s = image[:,:,block_size*i_:block_size*(i_+1), block_size*j_:block_size*(j_+1)] 109 | split_images.append(image_s) 110 | if len(split_images)>1: 111 | h_ratio = block_size/h 112 | w_ratio = block_size/w 113 | if h_ratio<=w_ratio: 114 | w_ = min(block_size, round(w*h_ratio)) 115 | h_ = block_size 116 | else: 117 | w_ = block_size 118 | h_ = min(block_size, round(h*w_ratio)) 119 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 120 | image_s = torch.zeros((1, 3, block_size, block_size)).to(dtype=image_inter.dtype, device=image_inter.device) 121 | image_s[:, :, :h_, :w_] = image_inter 122 | split_images.append(image_s) 123 | image_tensor = torch.cat(split_images, dim=0) 124 | else: 125 | image_tensor = process_images([image], image_processor, model.config)[0] 126 | image_tensor = image_tensor.unsqueeze(0) 127 | h_block = 1 128 | w_block = 1 129 | 130 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 131 | keywords = [stop_str] 132 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 133 | mode = model.config.image_aspect_ratio 134 | 135 | with torch.inference_mode(): 136 | model.orig_forward = model.forward 137 | model.forward = partial(model.orig_forward, 138 | mode=mode, 139 | h_block=[h_block], 140 | 
w_block=[w_block] 141 | ) 142 | 143 | 144 | output_ids = model.generate( 145 | input_ids, 146 | images=image_tensor.to(dtype=torch.bfloat16, device='cuda', non_blocking=True), 147 | do_sample=True if args.temperature > 0 else False, 148 | temperature=args.temperature, 149 | top_p=args.top_p, 150 | num_beams=args.num_beams, 151 | # no_repeat_ngram_size=3, 152 | max_new_tokens=1024, 153 | use_cache=True) 154 | 155 | model.forward = model.orig_forward 156 | 157 | input_token_len = input_ids.shape[1] 158 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 159 | if n_diff_input_output > 0: 160 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 161 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 162 | outputs = outputs.strip() 163 | if outputs.endswith(stop_str): 164 | outputs = outputs[:-len(stop_str)] 165 | outputs = outputs.strip() 166 | 167 | ans_id = shortuuid.uuid() 168 | ans_file.write(json.dumps({"question_id": idx, 169 | "prompt": cur_prompt, 170 | "text": outputs, 171 | "answer_id": ans_id, 172 | "model_id": model_name, 173 | "metadata": {}}) + "\n") 174 | ans_file.flush() 175 | ans_file.close() 176 | 177 | if __name__ == "__main__": 178 | parser = argparse.ArgumentParser() 179 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 180 | parser.add_argument("--model-base", type=str, default=None) 181 | parser.add_argument("--image-folder", type=str, default="") 182 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 183 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 184 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 185 | parser.add_argument("--num-chunks", type=int, default=1) 186 | parser.add_argument("--chunk-idx", type=int, default=0) 187 | parser.add_argument("--temperature", type=float, default=0.2) 188 | parser.add_argument("--top_p", type=float, default=None) 189 | parser.add_argument("--num_beams", type=int, default=1) 190 | args = parser.parse_args() 191 | 192 | eval_model(args) 193 | -------------------------------------------------------------------------------- /llava/eval/run_llava.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 5 | from llava.conversation import conv_templates, SeparatorStyle 6 | from llava.model.builder import load_pretrained_model 7 | from llava.utils import disable_torch_init 8 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 9 | 10 | from PIL import Image 11 | 12 | import requests 13 | from PIL import Image 14 | from io import BytesIO 15 | 16 | 17 | def load_image(image_file): 18 | if image_file.startswith('http') or image_file.startswith('https'): 19 | response = requests.get(image_file) 20 | image = Image.open(BytesIO(response.content)).convert('RGB') 21 | else: 22 | image = Image.open(image_file).convert('RGB') 23 | return image 24 | 25 | 26 | def eval_model(args): 27 | # Model 28 | disable_torch_init() 29 | 30 | model_name = get_model_name_from_path(args.model_path) 31 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name) 32 | 33 | qs = args.query 34 | if model.config.mm_use_im_start_end: 35 | qs = DEFAULT_IM_START_TOKEN + 
DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 36 | else: 37 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 38 | 39 | if 'llama-2' in model_name.lower(): 40 | conv_mode = "llava_llama_2" 41 | elif "v1" in model_name.lower(): 42 | conv_mode = "llava_v1" 43 | elif "mpt" in model_name.lower(): 44 | conv_mode = "mpt" 45 | else: 46 | conv_mode = "llava_v0" 47 | 48 | if args.conv_mode is not None and conv_mode != args.conv_mode: 49 | print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode)) 50 | else: 51 | args.conv_mode = conv_mode 52 | 53 | conv = conv_templates[args.conv_mode].copy() 54 | conv.append_message(conv.roles[0], qs) 55 | conv.append_message(conv.roles[1], None) 56 | prompt = conv.get_prompt() 57 | 58 | image = load_image(args.image_file) 59 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda() 60 | 61 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 62 | 63 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 64 | keywords = [stop_str] 65 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 66 | 67 | with torch.inference_mode(): 68 | output_ids = model.generate( 69 | input_ids, 70 | images=image_tensor, 71 | do_sample=True, 72 | temperature=0.2, 73 | max_new_tokens=1024, 74 | use_cache=True, 75 | stopping_criteria=[stopping_criteria]) 76 | 77 | input_token_len = input_ids.shape[1] 78 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 79 | if n_diff_input_output > 0: 80 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 81 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 82 | outputs = outputs.strip() 83 | if outputs.endswith(stop_str): 84 | outputs = outputs[:-len(stop_str)] 85 | outputs = outputs.strip() 86 | print(outputs) 87 | 88 | if __name__ == "__main__": 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 91 | parser.add_argument("--model-base", type=str, default=None) 92 | parser.add_argument("--image-file", type=str, required=True) 93 | parser.add_argument("--query", type=str, required=True) 94 | parser.add_argument("--conv-mode", type=str, default=None) 95 | args = parser.parse_args() 96 | 97 | eval_model(args) 98 | -------------------------------------------------------------------------------- /llava/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in 
os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /llava/mm_utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from io import BytesIO 3 | import base64 4 | 5 | import torch 6 | from transformers import StoppingCriteria 7 | from llava.constants import IMAGE_TOKEN_INDEX 8 | 9 | 10 | def load_image_from_base64(image): 11 | return Image.open(BytesIO(base64.b64decode(image))) 12 | 13 | 14 | def expand2square(pil_img, background_color): 15 | width, height = pil_img.size 16 | if width == height: 17 | return pil_img 18 | elif width > height: 19 | result = Image.new(pil_img.mode, (width, width), background_color) 20 | result.paste(pil_img, (0, (width - height) // 2)) 21 | return result 22 | else: 23 | result = Image.new(pil_img.mode, (height, height), background_color) 24 | result.paste(pil_img, ((height - width) // 2, 0)) 25 | return result 26 | 27 | 28 | def process_images(images, image_processor, model_cfg): 29 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 30 | new_images = [] 31 | if image_aspect_ratio == 'pad': 32 | for image in images: 33 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 34 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 35 | new_images.append(image) 36 | else: 37 | return image_processor(images, return_tensors='pt')['pixel_values'] 38 | if all(x.shape == new_images[0].shape for x in new_images): 39 | new_images = torch.stack(new_images, dim=0) 40 | return new_images 41 | 42 | 43 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 44 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 45 | 46 | def insert_separator(X, sep): 47 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 48 | 49 | input_ids = [] 50 | offset = 0 51 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 52 | offset = 1 53 | input_ids.append(prompt_chunks[0][0]) 54 | 
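    # Interleave `[image_token_index] * (offset + 1)` between the tokenized chunks of the
    # prompt (the prompt was split on the '<image>' placeholder above). Taking `x[offset:]`
    # from every element keeps exactly one image_token_index per placeholder and, when the
    # tokenizer prepends a BOS (offset == 1), also drops the duplicated BOS at the start of
    # every chunk after the first.
    # Illustrative example with made-up ids (IMG = image_token_index):
    #   chunks [[BOS, a, b], [BOS, c, d]]  ->  input_ids [BOS, a, b, IMG, c, d]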
55 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 56 | input_ids.extend(x[offset:]) 57 | 58 | if return_tensors is not None: 59 | if return_tensors == 'pt': 60 | return torch.tensor(input_ids, dtype=torch.long) 61 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 62 | return input_ids 63 | 64 | 65 | def get_model_name_from_path(model_path): 66 | model_path = model_path.strip("/") 67 | model_paths = model_path.split("/") 68 | if model_paths[-1].startswith('checkpoint-'): 69 | return model_paths[-2] + "_" + model_paths[-1] 70 | else: 71 | return model_paths[-1] 72 | 73 | 74 | 75 | 76 | class KeywordsStoppingCriteria(StoppingCriteria): 77 | def __init__(self, keywords, tokenizer, input_ids): 78 | self.keywords = keywords 79 | self.keyword_ids = [] 80 | self.max_keyword_len = 0 81 | for keyword in keywords: 82 | cur_keyword_ids = tokenizer(keyword).input_ids 83 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 84 | cur_keyword_ids = cur_keyword_ids[1:] 85 | if len(cur_keyword_ids) > self.max_keyword_len: 86 | self.max_keyword_len = len(cur_keyword_ids) 87 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 88 | self.tokenizer = tokenizer 89 | self.start_len = input_ids.shape[1] 90 | 91 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 92 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO 93 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 94 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 95 | for keyword_id in self.keyword_ids: 96 | if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all(): 97 | return True 98 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 99 | for keyword in self.keywords: 100 | if keyword in outputs: 101 | return True 102 | return False -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig 3 | -------------------------------------------------------------------------------- /llava/model/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
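# `load_pretrained_model` below handles three LLaVA checkpoint layouts: a LoRA
# checkpoint merged onto `model_base`, a base LLM plus a separately trained
# `mm_projector.bin`, and a full standalone checkpoint; model names without
# 'llava' fall back to a plain `AutoModelForCausalLM`.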
14 | 15 | 16 | import os 17 | import warnings 18 | import shutil 19 | 20 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 21 | import torch 22 | from llava.model import * 23 | from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 24 | 25 | 26 | def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda"): 27 | kwargs = {"device_map": device_map} 28 | 29 | if load_8bit: 30 | kwargs['load_in_8bit'] = True 31 | elif load_4bit: 32 | kwargs['load_in_4bit'] = True 33 | kwargs['quantization_config'] = BitsAndBytesConfig( 34 | load_in_4bit=True, 35 | bnb_4bit_compute_dtype=torch.float16, 36 | bnb_4bit_use_double_quant=True, 37 | bnb_4bit_quant_type='nf4' 38 | ) 39 | else: 40 | kwargs['torch_dtype'] = torch.float16 41 | 42 | if 'llava' in model_name.lower(): 43 | # Load LLaVA model 44 | if 'lora' in model_name.lower() and model_base is None: 45 | warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.') 46 | if 'lora' in model_name.lower() and model_base is not None: 47 | lora_cfg_pretrained = AutoConfig.from_pretrained(model_path) 48 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 49 | print('Loading LLaVA from base model...') 50 | model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) 51 | token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features 52 | if model.lm_head.weight.shape[0] != token_num: 53 | model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) 54 | model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) 55 | 56 | print('Loading additional LLaVA weights...') 57 | if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): 58 | non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') 59 | else: 60 | # this is probably from HF Hub 61 | from huggingface_hub import hf_hub_download 62 | def load_from_hf(repo_id, filename, subfolder=None): 63 | cache_file = hf_hub_download( 64 | repo_id=repo_id, 65 | filename=filename, 66 | subfolder=subfolder) 67 | return torch.load(cache_file, map_location='cpu') 68 | non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') 69 | non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} 70 | if any(k.startswith('model.model.') for k in non_lora_trainables): 71 | non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} 72 | model.load_state_dict(non_lora_trainables, strict=False) 73 | 74 | from peft import PeftModel 75 | print('Loading LoRA weights...') 76 | model = PeftModel.from_pretrained(model, model_path) 77 | print('Merging LoRA weights...') 78 | model = model.merge_and_unload() 79 | print('Model is loaded...') 80 | elif model_base is not None: 81 | # this may be mm projector only 82 | print('Loading LLaVA from base model...') 83 | if 'mpt' in model_name.lower(): 84 | if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')): 85 | 
shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py')) 86 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True) 87 | cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True) 88 | model = LlavaMPTForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) 89 | else: 90 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 91 | cfg_pretrained = AutoConfig.from_pretrained(model_path) 92 | model = LlavaLlamaForCausalLM.from_pretrained( 93 | model_base, 94 | # torch_dtype=torch.bfloat16, 95 | ).cuda() 96 | # model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) 97 | 98 | mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') 99 | mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} 100 | model.load_state_dict(mm_projector_weights, strict=False) 101 | else: 102 | if 'mpt' in model_name.lower(): 103 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 104 | model = LlavaMPTForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 105 | else: 106 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 107 | model = LlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 108 | else: 109 | # Load language model 110 | if model_base is not None: 111 | # PEFT model 112 | from peft import PeftModel 113 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 114 | model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto") 115 | print(f"Loading LoRA weights from {model_path}") 116 | model = PeftModel.from_pretrained(model, model_path) 117 | print(f"Merging weights") 118 | model = model.merge_and_unload() 119 | print('Convert to FP16...') 120 | model.to(torch.float16) 121 | else: 122 | use_fast = False 123 | if 'mpt' in model_name.lower(): 124 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 125 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) 126 | else: 127 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 128 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 129 | 130 | image_processor = None 131 | 132 | if 'llava' in model_name.lower(): 133 | mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) 134 | mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) 135 | if mm_use_im_patch_token: 136 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) 137 | if mm_use_im_start_end: 138 | tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) 139 | model.resize_token_embeddings(len(tokenizer)) 140 | 141 | vision_tower = model.get_vision_tower() 142 | if not vision_tower.is_loaded: 143 | vision_tower.load_model() 144 | vision_tower.to(device=device, dtype=torch.float16) 145 | image_processor = vision_tower.image_processor 146 | 147 | if hasattr(model.config, "max_sequence_length"): 148 | context_len = model.config.max_sequence_length 149 | else: 150 | context_len = 2048 151 | 152 | return tokenizer, model, image_processor, context_len 153 | 
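# Illustrative usage sketch (the checkpoint id below is a placeholder; substitute
# your own TokenPacker/LLaVA checkpoint path):
if __name__ == "__main__":
    from llava.mm_utils import get_model_name_from_path

    demo_model_path = "liuhaotian/llava-v1.5-13b"  # placeholder checkpoint
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        demo_model_path,
        model_base=None,
        model_name=get_model_name_from_path(demo_model_path),
    )
    print(f"loaded {type(model).__name__}, context length {context_len}")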
-------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
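# In this repo `LlavaLlamaForCausalLM.forward` additionally accepts `mode`, `h_block`
# and `w_block`, which carry the high-resolution 'slice' layout of the image patches;
# the evaluation scripts pass them by temporarily wrapping `model.forward` with
# `functools.partial(...)` before calling `generate()` (see llava/eval/model_vqa.py).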
14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | from torch.nn import CrossEntropyLoss 21 | 22 | from transformers import AutoConfig, AutoModelForCausalLM, \ 23 | LlamaConfig, LlamaModel, LlamaForCausalLM 24 | 25 | from transformers.modeling_outputs import CausalLMOutputWithPast 26 | 27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaConfig(LlamaConfig): 31 | model_type = "llava" 32 | 33 | 34 | class LlavaLlamaModel(LlavaMetaModel, LlamaModel): 35 | config_class = LlavaConfig 36 | 37 | def __init__(self, config: LlamaConfig): 38 | super(LlavaLlamaModel, self).__init__(config) 39 | 40 | 41 | class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM): 42 | config_class = LlavaConfig 43 | 44 | def __init__(self, config): 45 | super(LlamaForCausalLM, self).__init__(config) 46 | self.model = LlavaLlamaModel(config) 47 | 48 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.model 55 | 56 | def forward( 57 | self, 58 | input_ids: torch.LongTensor = None, 59 | attention_mask: Optional[torch.Tensor] = None, 60 | past_key_values: Optional[List[torch.FloatTensor]] = None, 61 | inputs_embeds: Optional[torch.FloatTensor] = None, 62 | labels: Optional[torch.LongTensor] = None, 63 | use_cache: Optional[bool] = None, 64 | output_attentions: Optional[bool] = None, 65 | output_hidden_states: Optional[bool] = None, 66 | images: Optional[torch.FloatTensor] = None, 67 | return_dict: Optional[bool] = None, 68 | mode = None, 69 | h_block = None, 70 | w_block = None 71 | ) -> Union[Tuple, CausalLMOutputWithPast]: 72 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions 73 | output_hidden_states = ( 74 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 75 | ) 76 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 77 | 78 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images, mode, h_block, w_block) 79 | 80 | # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) 81 | outputs = self.model( 82 | input_ids=input_ids, 83 | attention_mask=attention_mask, 84 | past_key_values=past_key_values, 85 | inputs_embeds=inputs_embeds, 86 | use_cache=use_cache, 87 | output_attentions=output_attentions, 88 | output_hidden_states=output_hidden_states, 89 | return_dict=return_dict 90 | ) 91 | 92 | hidden_states = outputs[0] 93 | logits = self.lm_head(hidden_states) 94 | 95 | loss = None 96 | if labels is not None: 97 | # Shift so that tokens < n predict n 98 | shift_logits = logits[..., :-1, :].contiguous() 99 | shift_labels = labels[..., 1:].contiguous() 100 | # Flatten the tokens 101 | loss_fct = CrossEntropyLoss() 102 | shift_logits = shift_logits.view(-1, self.config.vocab_size) 103 | shift_labels = shift_labels.view(-1) 104 | # Enable model/pipeline parallelism 105 | shift_labels = shift_labels.to(shift_logits.device) 106 | loss = loss_fct(shift_logits, shift_labels) 107 | 108 | if not return_dict: 109 | output = (logits,) + outputs[1:] 110 | return (loss,) + output if loss is not None else output 111 | 112 | return CausalLMOutputWithPast( 113 | loss=loss, 114 | logits=logits, 115 | 
past_key_values=outputs.past_key_values, 116 | hidden_states=outputs.hidden_states, 117 | attentions=outputs.attentions, 118 | ) 119 | 120 | def prepare_inputs_for_generation( 121 | self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs 122 | ): 123 | if past_key_values: 124 | input_ids = input_ids[:, -1:] 125 | 126 | # if `inputs_embeds` are passed, we only want to use them in the 1st generation step 127 | if inputs_embeds is not None and past_key_values is None: 128 | model_inputs = {"inputs_embeds": inputs_embeds} 129 | else: 130 | model_inputs = {"input_ids": input_ids} 131 | 132 | model_inputs.update( 133 | { 134 | "past_key_values": past_key_values, 135 | "use_cache": kwargs.get("use_cache"), 136 | "attention_mask": attention_mask, 137 | "images": kwargs.get("images", None), 138 | } 139 | ) 140 | return model_inputs 141 | 142 | AutoConfig.register("llava", LlavaConfig) 143 | AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) 144 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import List, Optional, Tuple 17 | import warnings 18 | 19 | import torch 20 | import torch.nn.functional as F 21 | import math 22 | 23 | from transformers import AutoConfig, AutoModelForCausalLM 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | 26 | from .mpt.modeling_mpt import MPTConfig, MPTForCausalLM, MPTModel 27 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaMPTConfig(MPTConfig): 31 | model_type = "llava_mpt" 32 | 33 | 34 | class LlavaMPTModel(LlavaMetaModel, MPTModel): 35 | config_class = LlavaMPTConfig 36 | 37 | def __init__(self, config: MPTConfig): 38 | config.hidden_size = config.d_model 39 | super(LlavaMPTModel, self).__init__(config) 40 | 41 | def embed_tokens(self, x): 42 | return self.wte(x) 43 | 44 | 45 | class LlavaMPTForCausalLM(MPTForCausalLM, LlavaMetaForCausalLM): 46 | config_class = LlavaMPTConfig 47 | supports_gradient_checkpointing = True 48 | 49 | def __init__(self, config): 50 | super(MPTForCausalLM, self).__init__(config) 51 | 52 | if not config.tie_word_embeddings: 53 | raise ValueError('MPTForCausalLM only supports tied word embeddings') 54 | self.transformer = LlavaMPTModel(config) 55 | self.logit_scale = None 56 | if config.logit_scale is not None: 57 | logit_scale = config.logit_scale 58 | if isinstance(logit_scale, str): 59 | if logit_scale == 'inv_sqrt_d_model': 60 | logit_scale = 1 / math.sqrt(config.d_model) 61 | else: 62 | raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.") 63 | self.logit_scale = logit_scale 64 | 65 | def get_model(self): 66 | return self.transformer 67 | 68 | def _set_gradient_checkpointing(self, module, value=False): 69 | if isinstance(module, LlavaMPTModel): 70 | module.gradient_checkpointing = value 71 | 72 | def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None): 73 | return_dict = return_dict if return_dict is not None else self.config.return_dict 74 | use_cache = use_cache if use_cache is not None else self.config.use_cache 75 | 76 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 77 | outputs = self.transformer(input_ids=input_ids, inputs_embeds=inputs_embeds, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache) 78 | # FIXME: this is a hack to fix the multiple gpu inference issue in https://github.com/haotian-liu/LLaVA/issues/338 79 | logits = F.linear(outputs.last_hidden_state.to(self.transformer.wte.weight.device), self.transformer.wte.weight) 80 | if self.logit_scale is not None: 81 | if self.logit_scale == 0: 82 | warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. 
This will produce uniform (uninformative) outputs.') 83 | logits *= self.logit_scale 84 | loss = None 85 | if labels is not None: 86 | labels = torch.roll(labels, shifts=-1) 87 | labels[:, -1] = -100 88 | loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1)) 89 | return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states) 90 | 91 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 92 | if inputs_embeds is not None: 93 | raise NotImplementedError('inputs_embeds is not implemented for MPT yet') 94 | attention_mask = kwargs['attention_mask'].bool() 95 | if attention_mask[:, -1].sum() != attention_mask.shape[0]: 96 | raise NotImplementedError('MPT does not support generation with right padding.') 97 | if self.transformer.attn_uses_sequence_id and self.training: 98 | sequence_id = torch.zeros_like(input_ids[:1]) 99 | else: 100 | sequence_id = None 101 | if past_key_values is not None: 102 | input_ids = input_ids[:, -1].unsqueeze(-1) 103 | if self.transformer.prefix_lm: 104 | prefix_mask = torch.ones_like(attention_mask) 105 | if kwargs.get('use_cache') == False: 106 | raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.') 107 | else: 108 | prefix_mask = None 109 | return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True), "images": kwargs.get("images", None)} 110 | 111 | 112 | AutoConfig.register("llava_mpt", LlavaMPTConfig) 113 | AutoModelForCausalLM.register(LlavaMPTConfig, LlavaMPTForCausalLM) 114 | -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/attention.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/attention.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/blocks.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/blocks.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/flash_attn_triton.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/flash_attn_triton.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/meta_init_context.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/meta_init_context.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/norm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/norm.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/param_init_fns.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/param_init_fns.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/adapt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast 3 | Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] 4 | NUM_SENTINEL_TOKENS: int = 100 5 | 6 | def adapt_tokenizer_for_denoising(tokenizer: Tokenizer): 7 | """Adds sentinel tokens and padding token (if missing). 8 | 9 | Expands the tokenizer vocabulary to include sentinel tokens 10 | used in mixture-of-denoiser tasks as well as a padding token. 11 | 12 | All added tokens are added as special tokens. No tokens are 13 | added if sentinel tokens and padding token already exist. 
14 | """ 15 | sentinels_to_add = [f'' for i in range(NUM_SENTINEL_TOKENS)] 16 | tokenizer.add_tokens(sentinels_to_add, special_tokens=True) 17 | if tokenizer.pad_token is None: 18 | tokenizer.add_tokens('', special_tokens=True) 19 | tokenizer.pad_token = '' 20 | assert tokenizer.pad_token_id is not None 21 | sentinels = ''.join([f'' for i in range(NUM_SENTINEL_TOKENS)]) 22 | _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids 23 | tokenizer.sentinel_token_ids = _sentinel_token_ids 24 | 25 | class AutoTokenizerForMOD(AutoTokenizer): 26 | """AutoTokenizer + Adaptation for MOD. 27 | 28 | A simple wrapper around AutoTokenizer to make instantiating 29 | an MOD-adapted tokenizer a bit easier. 30 | 31 | MOD-adapted tokenizers have sentinel tokens (e.g., ), 32 | a padding token, and a property to get the token ids of the 33 | sentinel tokens. 34 | """ 35 | 36 | @classmethod 37 | def from_pretrained(cls, *args, **kwargs): 38 | """See `AutoTokenizer.from_pretrained` docstring.""" 39 | tokenizer = super().from_pretrained(*args, **kwargs) 40 | adapt_tokenizer_for_denoising(tokenizer) 41 | return tokenizer -------------------------------------------------------------------------------- /llava/model/language_model/mpt/blocks.py: -------------------------------------------------------------------------------- 1 | """GPT Blocks used for the GPT Model.""" 2 | from typing import Dict, Optional, Tuple 3 | import torch 4 | import torch.nn as nn 5 | from .attention import ATTN_CLASS_REGISTRY 6 | from .norm import NORM_CLASS_REGISTRY 7 | 8 | class MPTMLP(nn.Module): 9 | 10 | def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None): 11 | super().__init__() 12 | self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device) 13 | self.act = nn.GELU(approximate='none') 14 | self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device) 15 | self.down_proj._is_residual = True 16 | 17 | def forward(self, x): 18 | return self.down_proj(self.act(self.up_proj(x))) 19 | 20 | class MPTBlock(nn.Module): 21 | 22 | def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, device: Optional[str]=None, **kwargs): 23 | del kwargs 24 | super().__init__() 25 | norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] 26 | attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']] 27 | self.norm_1 = norm_class(d_model, device=device) 28 | self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], d_model=d_model, n_heads=n_heads, verbose=verbose, device=device) 29 | self.norm_2 = norm_class(d_model, device=device) 30 | self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device) 31 | self.resid_attn_dropout = nn.Dropout(resid_pdrop) 32 | self.resid_ffn_dropout = nn.Dropout(resid_pdrop) 33 | 34 | def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: 35 | a = 
self.norm_1(x) 36 | (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal) 37 | x = x + self.resid_attn_dropout(b) 38 | m = self.norm_2(x) 39 | n = self.ffn(m) 40 | x = x + self.resid_ffn_dropout(n) 41 | return (x, attn_weights, past_key_value) -------------------------------------------------------------------------------- /llava/model/language_model/mpt/configuration_mpt.py: -------------------------------------------------------------------------------- 1 | """A HuggingFace-style model configuration.""" 2 | from typing import Dict, Optional, Union 3 | from transformers import PretrainedConfig 4 | attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8} 5 | init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0} 6 | 7 | class MPTConfig(PretrainedConfig): 8 | model_type = 'mpt' 9 | 10 | def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs): 11 | """The MPT configuration class. 12 | 13 | Args: 14 | d_model (int): The size of the embedding dimension of the model. 15 | n_heads (int): The number of attention heads. 16 | n_layers (int): The number of layers in the model. 17 | expansion_ratio (int): The ratio of the up/down scale in the MLP. 18 | max_seq_len (int): The maximum sequence length of the model. 19 | vocab_size (int): The size of the vocabulary. 20 | resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. 21 | emb_pdrop (float): The dropout probability for the embedding layer. 22 | learned_pos_emb (bool): Whether to use learned positional embeddings 23 | attn_config (Dict): A dictionary used to configure the model's attention module: 24 | attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention 25 | attn_pdrop (float): The dropout probability for the attention layers. 26 | attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. 27 | qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. 28 | clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to 29 | this value. 30 | softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, 31 | use the default scale of ``1/sqrt(d_keys)``. 32 | prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an 33 | extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix 34 | can attend to one another bi-directionally. Tokens outside the prefix use causal attention. 
35 | attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. 36 | When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates 37 | which sub-sequence each token belongs to. 38 | Defaults to ``False`` meaning any provided `sequence_id` will be ignored. 39 | alibi (bool): Whether to use the alibi bias instead of position embeddings. 40 | alibi_bias_max (int): The maximum value of the alibi bias. 41 | init_device (str): The device to use for parameter initialization. 42 | logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. 43 | no_bias (bool): Whether to use bias in all layers. 44 | verbose (int): The verbosity level. 0 is silent. 45 | embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. 46 | norm_type (str): choose type of norm to use 47 | multiquery_attention (bool): Whether to use multiquery attention implementation. 48 | use_cache (bool): Whether or not the model should return the last key/values attentions 49 | init_config (Dict): A dictionary used to configure the model initialization: 50 | init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', 51 | 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or 52 | 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. 53 | init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. 54 | emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. 55 | emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution 56 | used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. 57 | init_std (float): The standard deviation of the normal distribution used to initialize the model, 58 | if using the baseline_ parameter initialization scheme. 59 | init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. 60 | fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. 61 | init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. 
62 | --- 63 | See llmfoundry.models.utils.param_init_fns.py for info on other param init config options 64 | """ 65 | self.d_model = d_model 66 | self.n_heads = n_heads 67 | self.n_layers = n_layers 68 | self.expansion_ratio = expansion_ratio 69 | self.max_seq_len = max_seq_len 70 | self.vocab_size = vocab_size 71 | self.resid_pdrop = resid_pdrop 72 | self.emb_pdrop = emb_pdrop 73 | self.learned_pos_emb = learned_pos_emb 74 | self.attn_config = attn_config 75 | self.init_device = init_device 76 | self.logit_scale = logit_scale 77 | self.no_bias = no_bias 78 | self.verbose = verbose 79 | self.embedding_fraction = embedding_fraction 80 | self.norm_type = norm_type 81 | self.use_cache = use_cache 82 | self.init_config = init_config 83 | if 'name' in kwargs: 84 | del kwargs['name'] 85 | if 'loss_fn' in kwargs: 86 | del kwargs['loss_fn'] 87 | super().__init__(**kwargs) 88 | self._validate_config() 89 | 90 | def _set_config_defaults(self, config, config_defaults): 91 | for (k, v) in config_defaults.items(): 92 | if k not in config: 93 | config[k] = v 94 | return config 95 | 96 | def _validate_config(self): 97 | self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults) 98 | self.init_config = self._set_config_defaults(self.init_config, init_config_defaults) 99 | if self.d_model % self.n_heads != 0: 100 | raise ValueError('d_model must be divisible by n_heads') 101 | if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])): 102 | raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1") 103 | if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: 104 | raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}") 105 | if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']: 106 | raise NotImplementedError('prefix_lm only implemented with torch and triton attention.') 107 | if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']: 108 | raise NotImplementedError('alibi only implemented with torch and triton attention.') 109 | if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']: 110 | raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.') 111 | if self.embedding_fraction > 1 or self.embedding_fraction <= 0: 112 | raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!') 113 | if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model': 114 | raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.") 115 | if self.init_config.get('name', None) is None: 116 | raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.") 117 | if not self.learned_pos_emb and (not self.attn_config['alibi']): 118 | raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.') -------------------------------------------------------------------------------- /llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: 
bool=False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) -------------------------------------------------------------------------------- /llava/model/language_model/mpt/meta_init_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import torch 3 | import torch.nn as nn 4 | 5 | @contextmanager 6 | def init_empty_weights(include_buffers: bool=False): 7 | """Meta initialization context manager. 8 | 9 | A context manager under which models are initialized with all parameters 10 | on the meta device, therefore creating an empty model. Useful when just 11 | initializing the model would blow the available RAM. 12 | 13 | Args: 14 | include_buffers (`bool`, *optional*, defaults to `False`): Whether or 15 | not to also put all buffers on the meta device while initializing. 16 | 17 | Example: 18 | ```python 19 | import torch.nn as nn 20 | 21 | # Initialize a model with 100 billions parameters in no time and without using any RAM. 22 | with init_empty_weights(): 23 | tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)]) 24 | ``` 25 | 26 | 27 | 28 | Any model created under this context manager has no weights. As such you can't do something like 29 | `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`]. 30 | 31 | 32 | """ 33 | with init_on_device(torch.device('meta'), include_buffers=include_buffers) as f: 34 | yield f 35 | 36 | @contextmanager 37 | def init_on_device(device: torch.device, include_buffers: bool=False): 38 | """Device initialization context manager. 39 | 40 | A context manager under which models are initialized with all parameters 41 | on the specified device. 42 | 43 | Args: 44 | device (`torch.device`): Device to initialize all parameters on. 45 | include_buffers (`bool`, *optional*, defaults to `False`): Whether or 46 | not to also put all buffers on the meta device while initializing. 
47 | 48 | Example: 49 | ```python 50 | import torch.nn as nn 51 | 52 | with init_on_device(device=torch.device("cuda")): 53 | tst = nn.Liner(100, 100) # on `cuda` device 54 | ``` 55 | """ 56 | old_register_parameter = nn.Module.register_parameter 57 | if include_buffers: 58 | old_register_buffer = nn.Module.register_buffer 59 | 60 | def register_empty_parameter(module, name, param): 61 | old_register_parameter(module, name, param) 62 | if param is not None: 63 | param_cls = type(module._parameters[name]) 64 | kwargs = module._parameters[name].__dict__ 65 | module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs) 66 | 67 | def register_empty_buffer(module, name, buffer): 68 | old_register_buffer(module, name, buffer) 69 | if buffer is not None: 70 | module._buffers[name] = module._buffers[name].to(device) 71 | if include_buffers: 72 | tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ['empty', 'zeros', 'ones', 'full']} 73 | else: 74 | tensor_constructors_to_patch = {} 75 | 76 | def patch_tensor_constructor(fn): 77 | 78 | def wrapper(*args, **kwargs): 79 | kwargs['device'] = device 80 | return fn(*args, **kwargs) 81 | return wrapper 82 | try: 83 | nn.Module.register_parameter = register_empty_parameter 84 | if include_buffers: 85 | nn.Module.register_buffer = register_empty_buffer 86 | for torch_function_name in tensor_constructors_to_patch.keys(): 87 | setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name))) 88 | yield 89 | finally: 90 | nn.Module.register_parameter = old_register_parameter 91 | if include_buffers: 92 | nn.Module.register_buffer = old_register_buffer 93 | for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items(): 94 | setattr(torch, torch_function_name, old_torch_function) -------------------------------------------------------------------------------- /llava/model/language_model/mpt/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def _cast_if_autocast_enabled(tensor): 4 | if torch.is_autocast_enabled(): 5 | if tensor.device.type == 'cuda': 6 | dtype = torch.get_autocast_gpu_dtype() 7 | elif tensor.device.type == 'cpu': 8 | dtype = torch.get_autocast_cpu_dtype() 9 | else: 10 | raise NotImplementedError() 11 | return tensor.to(dtype=dtype) 12 | return tensor 13 | 14 | class LPLayerNorm(torch.nn.LayerNorm): 15 | 16 | def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None): 17 | super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype) 18 | 19 | def forward(self, x): 20 | module_device = x.device 21 | downcast_x = _cast_if_autocast_enabled(x) 22 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 23 | downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias 24 | with torch.autocast(enabled=False, device_type=module_device.type): 25 | return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps) 26 | 27 | def rms_norm(x, weight=None, eps=1e-05): 28 | output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) 29 | if weight is not None: 30 | return output * weight 31 | return output 32 | 33 | class RMSNorm(torch.nn.Module): 34 | 35 | def __init__(self, normalized_shape, eps=1e-05, weight=True, 
dtype=None, device=None): 36 | super().__init__() 37 | self.eps = eps 38 | if weight: 39 | self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device)) 40 | else: 41 | self.register_parameter('weight', None) 42 | 43 | def forward(self, x): 44 | return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype) 45 | 46 | class LPRMSNorm(RMSNorm): 47 | 48 | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): 49 | super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device) 50 | 51 | def forward(self, x): 52 | downcast_x = _cast_if_autocast_enabled(x) 53 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 54 | with torch.autocast(enabled=False, device_type=x.device.type): 55 | return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype) 56 | NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm} -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | self.is_loaded = False 11 | 12 | self.vision_tower_name = vision_tower 13 | self.select_layer = args.mm_vision_select_layer 14 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 15 | 16 | if not delay_load: 17 | self.load_model() 18 | else: 19 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 20 | 21 | def load_model(self): 22 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 23 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 24 | self.vision_tower.requires_grad_(False) 25 | 26 | self.is_loaded = True 27 | 28 | def feature_select(self, image_forward_outs, layers=[12,16,22,23]): 29 | image_feature_list = [] 30 | for l in layers: 31 | image_feature_list.append(image_forward_outs.hidden_states[l]) 32 | image_features_multi = torch.cat(image_feature_list, dim=2) 33 | 34 | image_features = image_forward_outs.hidden_states[self.select_layer] 35 | 36 | if self.select_feature == 'patch': 37 | image_features = image_features[:, 1:] 38 | image_features_multi = image_features_multi[:, 1:] 39 | 40 | elif self.select_feature == 'cls_patch': 41 | image_features = image_features 42 | else: 43 | raise ValueError(f'Unexpected select 
feature: {self.select_feature}') 44 | return image_features, image_features_multi 45 | 46 | @torch.no_grad() 47 | def forward(self, images): 48 | 49 | if type(images) is list: 50 | image_features = [] 51 | for image in images: 52 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 53 | image_feature, image_feature_multi = self.feature_select(image_forward_out) 54 | 55 | image_features.append(image_feature.to(image.dtype)) 56 | image_features_multi.append(image_feature_multi.to(image.dtype)) 57 | 58 | else: 59 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 60 | image_features, image_features_multi = self.feature_select(image_forward_outs) 61 | 62 | return (image_features.to(images.dtype), image_features_multi.to(images.dtype)) 63 | 64 | @property 65 | def dummy_feature(self): 66 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 67 | 68 | @property 69 | def dtype(self): 70 | return self.vision_tower.dtype 71 | 72 | @property 73 | def device(self): 74 | return self.vision_tower.device 75 | 76 | @property 77 | def config(self): 78 | if self.is_loaded: 79 | return self.vision_tower.config 80 | else: 81 | return self.cfg_only 82 | 83 | @property 84 | def hidden_size(self): 85 | return self.config.hidden_size 86 | 87 | @property 88 | def num_patches(self): 89 | return (self.config.image_size // self.config.patch_size) ** 2 90 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | from functools import partial 5 | import numpy as np 6 | from torch.nn.init import trunc_normal_ 7 | from torch.nn import functional as F 8 | import math 9 | 10 | 11 | class IdentityMap(nn.Module): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def forward(self, x, *args, **kwargs): 16 | return x 17 | 18 | @property 19 | def config(self): 20 | return {"mm_projector_type": 'identity'} 21 | 22 | 23 | class SimpleResBlock(nn.Module): 24 | def __init__(self, channels): 25 | super().__init__() 26 | self.pre_norm = nn.LayerNorm(channels) 27 | 28 | self.proj = nn.Sequential( 29 | nn.Linear(channels, channels), 30 | nn.GELU(), 31 | nn.Linear(channels, channels) 32 | ) 33 | def forward(self, x): 34 | x = self.pre_norm(x) 35 | return x + self.proj(x) 36 | 37 | 38 | 39 | class TokenPacker(nn.Module): 40 | def __init__( 41 | self, 42 | raw_grid=24, 43 | embed_dim=1024, 44 | num_heads=1024//128, 45 | kv_dim=1024, 46 | hidden_size=4096, 47 | scale_factor=2, 48 | norm_layer=partial(nn.LayerNorm, eps=1e-6) 49 | ): 50 | super().__init__() 51 | if raw_grid%scale_factor!=0: 52 | raise ValueError("scale_factor must be divisible by grid size") 53 | self.raw_grid = raw_grid 54 | self.grid_size = raw_grid//scale_factor 55 | self.num_queries = self.grid_size ** 2 56 | self.embed_dim = embed_dim 57 | self.num_heads = num_heads 58 | self.scale_factor = scale_factor 59 | self.q_proj_1 = nn.Linear(kv_dim, embed_dim, bias=False) 60 | 61 | k_modules = [nn.Linear(4096, 1024)] 62 | for _ in range(1,2): 63 | k_modules.append(nn.GELU()) 64 | k_modules.append(nn.Linear(1024, 1024)) 65 | self.k_proj_1 = nn.Sequential(*k_modules) 66 | 67 | v_modules = [nn.Linear(4096, 1024)] 68 | for _ in range(1,2): 69 | v_modules.append(nn.GELU()) 70 | 
v_modules.append(nn.Linear(1024, 1024)) 71 | self.v_proj_1 = nn.Sequential(*v_modules) 72 | 73 | self.ln_q_1 = norm_layer(embed_dim) 74 | self.ln_k_1 = norm_layer(embed_dim) 75 | self.ln_v_1 = norm_layer(embed_dim) 76 | 77 | self.clip_attn = nn.MultiheadAttention(embed_dim, num_heads) 78 | 79 | modules = [nn.Linear(1024, hidden_size)] 80 | for _ in range(1, 2): 81 | modules.append(nn.GELU()) 82 | modules.append(nn.Linear(hidden_size, hidden_size)) 83 | self.mlp = nn.Sequential(*modules) 84 | 85 | self.apply(self._init_weights) 86 | 87 | def _init_weights(self, m): 88 | if isinstance(m, nn.Linear): 89 | trunc_normal_(m.weight, std=.02) 90 | if isinstance(m, nn.Linear) and m.bias is not None: 91 | nn.init.constant_(m.bias, 0) 92 | elif isinstance(m, nn.LayerNorm): 93 | nn.init.constant_(m.bias, 0) 94 | nn.init.constant_(m.weight, 1.0) 95 | 96 | def divide_feature(self, x, kernel_size, token_num, N, c): 97 | h = w = int(token_num**0.5) 98 | 99 | reshape_x = x.reshape(h, w, N, c).reshape(h//kernel_size, kernel_size, w, N, c) 100 | reshape_x = reshape_x.permute(0,2,1,3,4) 101 | reshape_x = reshape_x.reshape(h//kernel_size, w//kernel_size, kernel_size, kernel_size, N, c) 102 | reshape_x = reshape_x.permute(0,1,3,2,4,5).reshape(h//kernel_size, w//kernel_size, kernel_size*kernel_size, N, c) 103 | reshape_x = reshape_x.permute(2,0,1,3,4).reshape(kernel_size*kernel_size, -1, c) 104 | 105 | return reshape_x 106 | 107 | def forward(self, x, attn_mask=None): 108 | 109 | x_multi = x[1] # mulit-level 110 | x = x[0] # original single-level 111 | 112 | key = self.ln_k_1(self.k_proj_1(x_multi)).permute(1, 0, 2) 113 | value = self.ln_v_1(self.v_proj_1(x_multi)).permute(1, 0, 2) 114 | 115 | token_num, N, c = key.shape 116 | 117 | q = F.interpolate(x.reshape(x.shape[0],self.raw_grid,self.raw_grid,-1).float().permute(0,3,1,2), size=(self.grid_size, self.grid_size), mode='bilinear').permute(0,2,3,1) ## fix 118 | q = q.reshape(q.shape[0], -1, q.shape[-1]).to(x.dtype) 119 | 120 | query = self.ln_q_1(self.q_proj_1(q)).permute(1, 0, 2) 121 | 122 | reshape_query = self.divide_feature(query, 1, self.num_queries, N, c) 123 | reshape_key = self.divide_feature(key, self.scale_factor, token_num, N, c) 124 | reshape_value = self.divide_feature(value, self.scale_factor, token_num, N, value.shape[-1]) 125 | 126 | out = self.clip_attn( 127 | reshape_query, 128 | reshape_key, 129 | reshape_value, 130 | attn_mask=attn_mask)[0] 131 | 132 | x = out 133 | x = x.reshape(self.num_queries, N, -1) 134 | x = x.permute(1, 0, 2) 135 | 136 | x = self.mlp(x) 137 | return x 138 | 139 | def _repeat(self, query, N: int): 140 | return query.unsqueeze(1).repeat(1, N, 1) 141 | 142 | 143 | 144 | def build_vision_projector(config): 145 | return TokenPacker(hidden_size=config.hidden_size, scale_factor=config.scale_factor) 146 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/patch_divide.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision.ops.boxes import box_area 3 | 4 | patches_9=[ 5 | (1,1), 6 | (1,2),(2,1), 7 | (1,3),(3,1), 8 | (2,2),(1,4),(4,1), 9 | (1,5),(5,1), 10 | (1,6),(6,1),(2,3),(3,2), 11 | (1,7),(7,1), 12 | (4,2),(2,4),(1,8),(8,1), 13 | (3,3),(1,9),(9,1) 14 | ] 15 | 16 | patches_16=[ 17 | (1,1), 18 | (1,2),(2,1), 19 | (1,3),(3,1), 20 | (2,2),(1,4),(4,1), 21 | (1,5),(5,1), 22 | (1,6),(6,1),(2,3),(3,2), 23 | (1,7),(7,1), 24 | (4,2),(2,4),(1,8),(8,1), 25 | (3,3),(1,9),(9,1), 26 | (2,5),(5,2), 27 | (2,6),(6,2),(3,4), (4,3), 28 | (2,7),(7,2), 29 | (3,5),(5,3), 30 | (2,8),(8,2),(4,4) 31 | ] 32 | 33 | patches_25=[ 34 | (1,1), 35 | (1,2),(2,1), 36 | (1,3),(3,1), 37 | (2,2),(1,4),(4,1), 38 | (1,5),(5,1), 39 | (1,6),(6,1),(2,3),(3,2), 40 | (1,7),(7,1), 41 | (4,2),(2,4),(1,8),(8,1), 42 | (3,3),(1,9),(9,1), 43 | (2,5),(5,2), 44 | (2,6),(6,2),(3,4), (4,3), 45 | (2,7),(7,2), 46 | (3,5),(5,3), 47 | (2,8),(8,2),(4,4), 48 | (3,6),(6,3),(2,9),(9,2), 49 | (4,5),(5,4),(2,10),(10,2), 50 | (3,7),(7,3), 51 | (11,2),(2,11), 52 | (4,6),(6,4),(12,2),(2,12),(3,8),(8,3),(4,6),(6,4), 53 | (5,5) 54 | ] 55 | 56 | 57 | def box_iou(boxes1, area1, boxes2, eps=1e-5): 58 | area2 = box_area(boxes2) 59 | 60 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 61 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 62 | 63 | wh = (rb - lt).clamp(min=0) # [N,M,2] 64 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 65 | 66 | union = area1[:, None] + area2 - inter 67 | 68 | iou = inter / (union+eps) 69 | return iou, union 70 | 71 | class Image_Patch: 72 | def __init__(self, image_size=336, patch_num=9): 73 | if patch_num == 9: 74 | patches = patches_9 75 | elif patch_num == 16: 76 | patches = patches_16 77 | elif patch_num == 25: 78 | patches = patches_25 79 | else: 80 | raise(NotImplementedError) 81 | 82 | # h,w 83 | if isinstance(image_size, int): 84 | image_size = (image_size, image_size) 85 | self.image_size = image_size 86 | 87 | self.patch_list = patches 88 | 89 | self.patches = torch.tensor( 90 | [[0, 0, _[0]*image_size[0], _[1]*image_size[1]] 91 | for _ in patches], requires_grad=False 92 | ) 93 | 94 | self.patch_areas = box_area(self.patches) 95 | 96 | def calculate(self, h, w): 97 | input_box = torch.tensor([0, 0, h, w]).unsqueeze(0) 98 | ratio = self.patches[:, 2:]/input_box[:, 2:] 99 | ratio = ratio.min(dim=-1)[0] 100 | score = torch.round(h*ratio) * torch.round(w*ratio) / self.patch_areas 101 | iou, _ = box_iou(self.patches, self.patch_areas, input_box*1.4) 102 | iou = iou[:, 0] 103 | score = score + iou*0.1 104 | idx = torch.argmax(score) 105 | return self.patch_list[idx] -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/serve/__init__.py 
-------------------------------------------------------------------------------- /llava/serve/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | 5 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 6 | from llava.conversation import conv_templates, SeparatorStyle 7 | from llava.model.builder import load_pretrained_model 8 | from llava.utils import disable_torch_init 9 | from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 10 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 11 | from llava.model import * 12 | import torch.nn.functional as F 13 | from functools import partial 14 | from llava.patch_divide import Image_Patch 15 | from torchvision.transforms import Compose, ToTensor, Normalize 16 | 17 | from PIL import Image 18 | 19 | import requests 20 | from PIL import Image 21 | from io import BytesIO 22 | from transformers import TextStreamer 23 | from functools import partial 24 | import time 25 | 26 | def main(args): 27 | # Model 28 | disable_torch_init() 29 | model_path = os.path.expanduser(args.model_path) 30 | model_name = get_model_name_from_path(model_path) 31 | tokenizer = AutoTokenizer.from_pretrained( 32 | args.model_path, 33 | model_max_length = 2048, 34 | padding_side="right", 35 | use_fast = True 36 | ) 37 | model = LlavaLlamaForCausalLM.from_pretrained( 38 | args.model_path, 39 | torch_dtype=torch.bfloat16, 40 | ).cuda() 41 | 42 | for m in model.modules(): 43 | m.tokenizer = tokenizer 44 | 45 | vision_tower = model.get_vision_tower() 46 | if not vision_tower.is_loaded: 47 | vision_tower.load_model() 48 | vision_tower.to(device='cuda', dtype=torch.float16) 49 | image_processor = vision_tower.image_processor 50 | 51 | patch_num = getattr(model.config, 'patch_num', '9') 52 | image_patch = Image_Patch(int(patch_num)) 53 | preprocess = Compose([ToTensor(), Normalize((0.48145466, 0.4578275, 0.40821073),(0.26862954, 0.26130258, 0.27577711))]) 54 | 55 | 56 | while True: 57 | conv = conv_templates[args.conv_mode].copy() 58 | if "mpt" in model_name.lower(): 59 | roles = ('user', 'assistant') 60 | else: 61 | roles = conv.roles 62 | 63 | image_file = input("image file: ") 64 | 65 | image = Image.open(image_file).convert('RGB') 66 | 67 | if model.config.image_aspect_ratio == 'slice': 68 | image = preprocess(image) 69 | image = image.unsqueeze(0) 70 | h, w = image.shape[-2:] 71 | block_size = 336 72 | h_block, w_block = image_patch.calculate(h, w) 73 | h_ratio = block_size*h_block/h 74 | w_ratio = block_size*w_block/w 75 | if h_ratio<=w_ratio: 76 | w_ = min(block_size*w_block, round(w*h_ratio)) 77 | h_ = block_size*h_block 78 | else: 79 | w_ = block_size*w_block 80 | h_ = min(block_size*h_block, round(h*w_ratio)) 81 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 82 | image = torch.zeros((1, 3, block_size*h_block, block_size*w_block)).to(dtype=image_inter.dtype, device=image_inter.device) 83 | image[:, :, :h_, :w_] = image_inter 84 | 85 | split_images = [] 86 | for i_ in range(h_block): 87 | for j_ in range(w_block): 88 | image_s = image[:,:,block_size*i_:block_size*(i_+1), block_size*j_:block_size*(j_+1)] 89 | split_images.append(image_s) 90 | if len(split_images)>1: 91 | h_ratio = block_size/h 92 | w_ratio = block_size/w 93 | if h_ratio<=w_ratio: 94 | w_ = min(block_size, round(w*h_ratio)) 95 | h_ = 
block_size 96 | else: 97 | w_ = block_size 98 | h_ = min(block_size, round(h*w_ratio)) 99 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 100 | image_s = torch.zeros((1, 3, block_size, block_size)).to(dtype=image_inter.dtype, device=image_inter.device) 101 | image_s[:, :, :h_, :w_] = image_inter 102 | split_images.append(image_s) 103 | image_tensor = torch.cat(split_images, dim=0) 104 | else: 105 | image_tensor = process_images([image], image_processor, model.config)[0] 106 | image_tensor = image_tensor.unsqueeze(0) 107 | h_block = 1 108 | w_block = 1 109 | 110 | try: 111 | inp = input(f"{roles[0]}: ") 112 | except EOFError: 113 | inp = "" 114 | if not inp: 115 | print("exit...") 116 | break 117 | # inp = "what is in the image?" 118 | 119 | print(f"{roles[1]}: ", end="") 120 | 121 | if image is not None: 122 | if model.config.mm_use_im_start_end: 123 | inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp 124 | else: 125 | inp = DEFAULT_IMAGE_TOKEN + '\n' + inp 126 | conv.append_message(conv.roles[0], inp) 127 | image = None 128 | else: 129 | # later messages 130 | conv.append_message(conv.roles[0], inp) 131 | conv.append_message(conv.roles[1], None) 132 | prompt = conv.get_prompt() 133 | 134 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 135 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 136 | keywords = [stop_str] 137 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 138 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) 139 | 140 | mode = model.config.image_aspect_ratio 141 | with torch.inference_mode(): 142 | model.orig_forward = model.forward 143 | model.forward = partial(model.orig_forward, 144 | mode=mode, 145 | h_block=h_block, 146 | w_block=w_block) 147 | start = time.time() 148 | 149 | output_ids = model.generate( 150 | input_ids, 151 | images=image_tensor.to(dtype=torch.bfloat16, device='cuda', non_blocking=True), 152 | do_sample=True, 153 | temperature=args.temperature, 154 | max_new_tokens=args.max_new_tokens, 155 | streamer=streamer, 156 | use_cache=True, 157 | stopping_criteria=[stopping_criteria]) 158 | model.forward = model.orig_forward 159 | 160 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() 161 | end = time.time() 162 | print("***time: ", end-start) 163 | conv.messages[-1][-1] = outputs 164 | 165 | if args.debug: 166 | print("\n", {"prompt": prompt, "outputs": outputs}, "\n") 167 | 168 | 169 | if __name__ == "__main__": 170 | parser = argparse.ArgumentParser() 171 | parser.add_argument("--model-path", type=str, default="path/to/tokenpacker") 172 | parser.add_argument("--device", type=str, default="cuda") 173 | parser.add_argument("--conv-mode", type=str, default='vicuna_v1') 174 | parser.add_argument("--temperature", type=float, default=0.2) 175 | parser.add_argument("--max-new-tokens", type=int, default=512) 176 | parser.add_argument("--load-8bit", action="store_true") 177 | parser.add_argument("--load-4bit", action="store_true") 178 | parser.add_argument("--debug", action="store_true") 179 | args = parser.parse_args() 180 | main(args) 181 | -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", 
type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /llava/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | import warnings 3 | 4 | import torch 5 | 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv 8 | 9 | try: 10 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func 11 | except ImportError: 12 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func 13 | from flash_attn.bert_padding import unpad_input, pad_input 14 | 15 | 16 | def forward( 17 | self, 18 | hidden_states: torch.Tensor, 19 | attention_mask: Optional[torch.Tensor] = None, 20 | position_ids: Optional[torch.Tensor] = None, 21 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 22 | output_attentions: bool = False, 23 | use_cache: bool = False, 24 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 25 | if output_attentions: 26 | warnings.warn( 27 | "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." 28 | ) 29 | 30 | bsz, q_len, _ = hidden_states.size() 31 | 32 | query_states = ( 33 | self.q_proj(hidden_states) 34 | .view(bsz, q_len, self.num_heads, self.head_dim) 35 | .transpose(1, 2) 36 | ) 37 | key_states = ( 38 | self.k_proj(hidden_states) 39 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 40 | .transpose(1, 2) 41 | ) 42 | value_states = ( 43 | self.v_proj(hidden_states) 44 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 45 | .transpose(1, 2) 46 | ) # shape: (b, num_heads, s, head_dim) 47 | 48 | kv_seq_len = key_states.shape[-2] 49 | if past_key_value is not None: 50 | kv_seq_len += past_key_value[0].shape[-2] 51 | 52 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 53 | query_states, key_states = apply_rotary_pos_emb( 54 | query_states, key_states, cos, sin, position_ids 55 | ) 56 | 57 | if past_key_value is not None: 58 | # reuse k, v 59 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 60 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 61 | 62 | past_key_value = (key_states, value_states) if use_cache else None 63 | 64 | # repeat k/v heads if n_kv_heads < n_heads 65 | key_states = repeat_kv(key_states, self.num_key_value_groups) 66 | value_states = repeat_kv(value_states, self.num_key_value_groups) 67 | 68 | # Transform the data into the format required by flash attention 69 | qkv = torch.stack([query_states, key_states, value_states], dim=2) 70 | qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] 71 | key_padding_mask = attention_mask 72 | 73 | if key_padding_mask is None: 74 | qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim) 75 | cu_q_lens = torch.arange( 76 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 77 | ) 78 | max_s = q_len 79 | output = flash_attn_unpadded_qkvpacked_func( 80 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 81 | ) 82 | output = output.view(bsz, q_len, -1) 83 | else: 84 | qkv = qkv.reshape(bsz, q_len, -1) 85 | qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) 86 | qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) 87 | output_unpad = 
flash_attn_unpadded_qkvpacked_func( 88 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 89 | ) 90 | output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) 91 | output = pad_input(output_unpad, indices, bsz, q_len) 92 | 93 | return self.o_proj(output), None, past_key_value 94 | 95 | 96 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 97 | # requires the attention mask to be the same as the key_padding_mask 98 | def _prepare_decoder_attention_mask( 99 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 100 | ): 101 | # [bsz, seq_len] 102 | return attention_mask 103 | 104 | 105 | def replace_llama_attn_with_flash_attn(): 106 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 107 | if cuda_major < 8: 108 | warnings.warn( 109 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 110 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 111 | ) 112 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 113 | _prepare_decoder_attention_mask 114 | ) 115 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 116 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | replace_llama_attn_with_flash_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /llava/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from llava.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True) 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 
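Requires the OPENAI_API_KEY environment variable. Returns True if the API flags the text, and False if the request fails or the expected fields are missing from the response.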
105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.1.3" 8 | description = "Towards GPT-4 like large language and visual assistant." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "accelerate==0.21.0", "peft==0.4.0", "bitsandbytes==0.41.0", "transformers==4.31.0", 17 | "pydantic<2,>=1", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 18 | "gradio==3.35.2", "gradio_client==0.2.9", 19 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 20 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 21 | ] 22 | 23 | [project.optional-dependencies] 24 | train = ["deepspeed==0.9.5", "ninja", "wandb"] 25 | 26 | [project.urls] 27 | "Homepage" = "https://llava-vl.github.io" 28 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues" 29 | 30 | [tool.setuptools.packages.find] 31 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 32 | 33 | [tool.wheel] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | -------------------------------------------------------------------------------- /scripts/convert_docvqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['questionId'] 14 | text = res['answer'].rstrip('.') 15 | all_answers.append({"questionId": question_id, "answer": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | 20 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = 
res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 
'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, f) 57 | -------------------------------------------------------------------------------- /scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 
35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /scripts/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_80k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/finetune_full_schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_158k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 3 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 21 | --version $PROMPT_VERSION \ 22 | --data_path ./playground/data/llava_instruct_80k.json \ 23 | --image_folder /path/to/coco/train2017 \ 24 | --vision_tower openai/clip-vit-large-patch14 \ 25 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 26 | --mm_vision_select_layer -2 \ 27 | --mm_use_im_start_end False \ 28 | --mm_use_im_patch_token False \ 29 | --bf16 True \ 30 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 16 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 1 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 50000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. 
\ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --dataloader_num_workers 4 \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/finetune_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --bits 4 \ 21 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 22 | --version $PROMPT_VERSION \ 23 | --data_path ./playground/data/llava_instruct_80k.json \ 24 | --image_folder /path/to/coco/train2017 \ 25 | --vision_tower openai/clip-vit-large-patch14 \ 26 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 27 | --mm_vision_select_layer -2 \ 28 | --mm_use_im_start_end False \ 29 | --mm_use_im_patch_token False \ 30 | --bf16 True \ 31 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 32 | --num_train_epochs 1 \ 33 | --per_device_train_batch_size 16 \ 34 | --per_device_eval_batch_size 4 \ 35 | --gradient_accumulation_steps 1 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 50000 \ 39 | --save_total_limit 1 \ 40 | --learning_rate 2e-5 \ 41 | --weight_decay 0. 
\ 42 | --warmup_ratio 0.03 \ 43 | --lr_scheduler_type "cosine" \ 44 | --logging_steps 1 \ 45 | --tf32 True \ 46 | --model_max_length 2048 \ 47 | --gradient_checkpointing True \ 48 | --lazy_preprocess True \ 49 | --dataloader_num_workers 4 \ 50 | --report_to wandb 51 | -------------------------------------------------------------------------------- /scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | # MODEL_VERSION=vicuna-v1-3-7b 8 | # MODEL_VERSION=llama-2-7b-chat 9 | 10 | ########### DO NOT CHANGE ########### 11 | ########### USE THIS FOR BOTH ########### 12 | PROMPT_VERSION=plain 13 | ########### DO NOT CHANGE ########### 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 18 | --version $PROMPT_VERSION \ 19 | --data_path /path/to/pretrain_data.json \ 20 | --image_folder /path/to/images \ 21 | --vision_tower openai/clip-vit-large-patch14 \ 22 | --tune_mm_mlp_adapter True \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 16 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 1 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 24000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 2e-3 \ 37 | --weight_decay 0. 
\ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to wandb 47 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/docvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-tokenpacker-7b" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.eval_docvqa \ 12 | --model-path llava-tokenpacker-7b \ 13 | --question-file ./playground/data/eval/docvqa/data/test_v1.0.json \ 14 | --image-folder /path/to/docvqa/images \ 15 | --answers-file ./playground/data/eval/docvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode vicuna_v1 & 20 | done 21 | 22 | wait 23 | 24 | output_file=./playground/data/eval/docvqa/answers/$CKPT/merge.jsonl 25 | 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ./playground/data/eval/docvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | python scripts/convert_docvqa_for_eval.py --src $output_file --dst ./playground/data/eval/docvqa/answers/$CKPT/submit.json 35 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-tokenpacker-7b" 9 | SPLIT="llava_gqa_testdev_balanced" 10 | GQADIR="./playground/data/eval/gqa/data" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path llava-tokenpacker-7b \ 15 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 16 | --image-folder /path/to/gqa/images \ 17 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 37 | 38 | cd $GQADIR 39 | python eval/eval.py --tier testdev_balanced 40 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path llava-tokenpacker-7b \ 7 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-tokenpacker-7b.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 14 | 15 | python scripts/convert_mmbench_for_submission.py \ 16 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 17 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 19 | --experiment llava-tokenpacker-7b -------------------------------------------------------------------------------- /scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NAME=llava-tokenpacker-7b 3 | 4 | python -m llava.eval.model_vqa_loader \ 5 | --model-path llava-tokenpacker-7b \ 6 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 7 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 8 | --answers-file ./playground/data/eval/MME/answers/$NAME.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | cd ./playground/data/eval/MME 13 | 14 | python convert_answer_to_mme.py --experiment $NAME 15 | 16 | cd eval_tool 17 | 18 | python calculation.py --results_dir answers/$NAME 19 | 20 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmmu_val.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="llava-tokenpacker-7b" 10 | CONFIG="llava/eval/mmmu/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python llava/eval/mmmu/eval/run_llava.py \ 14 | --data_path /path/to/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path llava-tokenpacker-7b \ 17 | --answers-file ./playground/data/eval/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "validation" \ 21 | --conv-mode vicuna_v1 & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python llava/eval/mmmu/eval/eval.py --result_file $output_file --output_path ./playground/data/eval/MMMU/$CKPT/val.json 37 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path llava-tokenpacker-7b \ 5 | --question-file /path/to/llava-mm-vet.jsonl \ 6 | --image-folder /path/to/mm-vet/images \ 7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-tokenpacker-7b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p ./playground/data/eval/mm-vet/results 12 | 13 | python scripts/convert_mmvet_for_eval.py \ 14 | --src ./playground/data/eval/mm-vet/answers/llava-tokenpacker-7b.jsonl \ 15 | --dst ./playground/data/eval/mm-vet/results/llava-tokenpacker-7b.json 16 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/ocr_bench.sh: -------------------------------------------------------------------------------- 1 | 2 | python -m llava.eval.eval_ocr_bench \ 3 | --model_path llava-tokenpacker-7b \ 4 | --image_folder /path/to/OCR-Bench/OCRBench_Images \ 5 | --output_folder ./playground/data/eval/ocr_bench \ 6 | --OCRBench_file /path/to/OCRBench.json \ 7 | --save_name llava-tokenpacker-7b \ 8 | --temperature 0 \ 9 | --conv_mode vicuna_v1 10 | 11 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | NAME="llava-tokenpacker-7b" 5 | 6 | python -m llava.eval.model_vqa_loader_pope \ 7 | --model-path llava-tokenpacker-7b \ 8 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 9 | --image-folder /path/to/coco_imgs \ 10 | --answers-file ./playground/data/eval/pope/answers/$NAME.jsonl \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | python llava/eval/eval_pope.py \ 15 | --annotation-dir ./playground/data/eval/pope/coco \ 16 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 17 | --result-file ./playground/data/eval/pope/answers/$NAME.jsonl 18 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0 python -m llava.eval.model_vqa_loader \ 4 | --model-path llava-tokenpacker-7b \ 5 | --question-file /path/to/llava_textvqa_val_v051_ocr.jsonl \ 6 | --image-folder /path/to/textvqa/train_images \ 7 | --answers-file ./playground/data/eval/textvqa/answers/llava-tokenpacker-7b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | CUDA_VISIBLE_DEVICES=0 python -m llava.eval.eval_textvqa \ 12 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 13 | --result-file ./playground/data/eval/textvqa/answers/llava-tokenpacker-7b.jsonl 14 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path llava-tokenpacker-7b \ 5 | --question-file 
./playground/data/eval/vizwiz/llava_test.jsonl \ 6 | --image-folder /path/to/vizwiz/test \ 7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-tokenpacker-7b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python scripts/convert_vizwiz_for_submission.py \ 12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 13 | --result-file ./playground/data/eval/vizwiz/answers/llava-tokenpacker-7b.jsonl \ 14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-tokenpacker-7b.json 15 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-tokenpacker-7b" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path llava-tokenpacker-7b \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder /path/to/VQAv2/test2015/ \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 36 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path vicuna-7b-v1.5 \ 6 | --version v1 \ 7 | --data_path /path/to/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./data/llava_mix665k \ 9 | --vision_tower ./clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-tokenpacker-pretrain/mm_projector.bin \ 11 | --mm_projector_type tokenpacker \ 12 | --scale_factor 2 \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --image_aspect_ratio pad \ 17 | --group_by_modality_length True \ 18 | --bf16 True \ 19 | --output_dir ./checkpoints/llava-tokenpacker \ 20 | --num_train_epochs 1 \ 21 | --per_device_train_batch_size 16 \ 22 | --per_device_eval_batch_size 4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --evaluation_strategy "no" \ 25 | --save_strategy "steps" \ 26 | --save_steps 50000 \ 27 | --save_total_limit 1 \ 28 | --learning_rate 2e-5 \ 29 | --weight_decay 0. 
\ 30 | --warmup_ratio 0.03 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --tf32 True \ 34 | --model_max_length 2048 \ 35 | --gradient_checkpointing True \ 36 | --dataloader_num_workers 4 \ 37 | --lazy_preprocess True \ 38 | --report_to "none" 39 | 40 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_hd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path vicuna-7b-v1.5 \ 6 | --version v1 \ 7 | --data_path /path/to/mgm_instruction.json \ 8 | --image_folder ./data/MGM-Finetune \ 9 | --vision_tower ./clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-tokenpacker-pretrain-hd/mm_projector.bin \ 11 | --mm_projector_type tokenpacker \ 12 | --patch_num 9 \ 13 | --scale_factor 2 \ 14 | --mm_vision_select_layer -2 \ 15 | --mm_use_im_start_end False \ 16 | --mm_use_im_patch_token False \ 17 | --image_aspect_ratio slice \ 18 | --group_by_modality_length True \ 19 | --bf16 True \ 20 | --output_dir ./checkpoints/llava-tokenpacker-hd \ 21 | --num_train_epochs 1 \ 22 | --per_device_train_batch_size 16 \ 23 | --per_device_eval_batch_size 4 \ 24 | --gradient_accumulation_steps 1 \ 25 | --evaluation_strategy "no" \ 26 | --save_strategy "steps" \ 27 | --save_steps 50000 \ 28 | --save_total_limit 1 \ 29 | --learning_rate 2e-5 \ 30 | --weight_decay 0. \ 31 | --warmup_ratio 0.03 \ 32 | --lr_scheduler_type "cosine" \ 33 | --logging_steps 1 \ 34 | --tf32 True \ 35 | --model_max_length 2048 \ 36 | --gradient_checkpointing True \ 37 | --dataloader_num_workers 4 \ 38 | --lazy_preprocess True \ 39 | --report_to "none" 40 | 41 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path vicuna-7b-v1.5 \ 6 | --version plain \ 7 | --data_path /path/to/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./data/llava_pretrain_558k \ 9 | --vision_tower ./clip-vit-large-patch14-336 \ 10 | --mm_projector_type tokenpacker \ 11 | --scale_factor 2 \ 12 | --tune_mm_mlp_adapter True \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-tokenpacker-pretrain/ \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 32 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 24000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 1e-3 \ 27 | --weight_decay 0. 
\ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to "none" 37 | 38 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain_hd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path vicuna-7b-v1.5 \ 6 | --version plain \ 7 | --data_path /path/to/mgm_pretrain.json \ 8 | --image_folder ./data/llava_pretrain_558k \ 9 | --vision_tower ./clip-vit-large-patch14-336 \ 10 | --mm_projector_type tokenpacker \ 11 | --patch_num 9 \ 12 | --scale_factor 2 \ 13 | --tune_mm_mlp_adapter True \ 14 | --mm_vision_select_layer -2 \ 15 | --mm_use_im_start_end False \ 16 | --mm_use_im_patch_token False \ 17 | --image_aspect_ratio slice \ 18 | --bf16 True \ 19 | --output_dir ./checkpoints/llava-tokenpacker-pretrain-hd/ \ 20 | --num_train_epochs 1 \ 21 | --per_device_train_batch_size 32 \ 22 | --per_device_eval_batch_size 4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --evaluation_strategy "no" \ 25 | --save_strategy "steps" \ 26 | --save_steps 24000 \ 27 | --save_total_limit 1 \ 28 | --learning_rate 1e-3 \ 29 | --weight_decay 0. \ 30 | --warmup_ratio 0.03 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --tf32 True \ 34 | --model_max_length 2048 \ 35 | --gradient_checkpointing True \ 36 | --dataloader_num_workers 4 \ 37 | --lazy_preprocess True \ 38 | --report_to "none" 39 | 40 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /scripts/zero3_offload.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } --------------------------------------------------------------------------------
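The three DeepSpeed configs above differ mainly in how aggressively they shard training state: zero2.json partitions gradients and optimizer states (ZeRO stage 2), zero3.json additionally partitions the model parameters (stage 3), and zero3_offload.json further moves optimizer states and parameters to CPU memory and carries its own AdamW/WarmupLR blocks. The training scripts pick one of them through the --deepspeed flag. The following is a minimal sketch, not a file in this repository, for printing which stage and offload devices each config requests before launching a run; it only assumes the configs sit under scripts/ as in the layout above.

# Illustrative helper (not part of the repo): summarize the ZeRO settings of the
# DeepSpeed JSON configs shipped in scripts/.
import json

def summarize_ds_config(path):
    """Return the ZeRO stage and any CPU/NVMe offload targets declared in a config."""
    with open(path) as f:
        cfg = json.load(f)
    zero = cfg.get("zero_optimization", {})
    return {
        "stage": zero.get("stage"),
        "optimizer_offload": zero.get("offload_optimizer", {}).get("device", "none"),
        "param_offload": zero.get("offload_param", {}).get("device", "none"),
    }

if __name__ == "__main__":
    for name in ("zero2.json", "zero3.json", "zero3_offload.json"):
        print(name, summarize_ds_config(f"scripts/{name}"))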