├── README.md
├── assets
│   ├── Algorithm.png
│   ├── compare.png
│   ├── ex1.png
│   ├── ex2.jpg
│   ├── framework.png
│   ├── hd.png
│   ├── high-reso.jpg
│   ├── projector_comparsion.jpg
│   ├── title.png
│   ├── vis-1.jpg
│   └── vis-2.jpg
├── docs
│   └── evaluation.md
├── llava
│   ├── __init__.py
│   ├── constants.py
│   ├── conversation.py
│   ├── eval
│   │   ├── eval_docvqa.py
│   │   ├── eval_gpt_review.py
│   │   ├── eval_gpt_review_bench.py
│   │   ├── eval_gpt_review_visual.py
│   │   ├── eval_ocr_bench.py
│   │   ├── eval_pope.py
│   │   ├── eval_textvqa.py
│   │   ├── m4c_evaluator.py
│   │   ├── mmmu
│   │   │   └── eval
│   │   │       ├── README.md
│   │   │       ├── answer_dict_val.json
│   │   │       ├── configs
│   │   │       │   └── llava1.5.yaml
│   │   │       ├── convert_to_test.py
│   │   │       ├── eval.py
│   │   │       ├── main_eval_only.py
│   │   │       ├── main_parse_and_eval.py
│   │   │       ├── print_results.py
│   │   │       ├── run_llava.py
│   │   │       └── utils
│   │   │           ├── __pycache__
│   │   │           │   ├── data_utils.cpython-310.pyc
│   │   │           │   ├── eval_utils.cpython-310.pyc
│   │   │           │   └── model_utils_ind.cpython-310.pyc
│   │   │           ├── data_utils.py
│   │   │           ├── eval_utils.py
│   │   │           ├── model_utils.py
│   │   │           └── model_utils_ind.py
│   │   ├── model_qa.py
│   │   ├── model_vqa.py
│   │   ├── model_vqa_loader.py
│   │   ├── model_vqa_loader_pope.py
│   │   ├── model_vqa_mmbench.py
│   │   ├── run_llava.py
│   │   └── summarize_gpt_review.py
│   ├── mm_utils.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── builder.py
│   │   ├── consolidate.py
│   │   ├── language_model
│   │   │   ├── llava_llama.py
│   │   │   ├── llava_mpt.py
│   │   │   └── mpt
│   │   │       ├── __pycache__
│   │   │       │   ├── adapt_tokenizer.cpython-310.pyc
│   │   │       │   ├── attention.cpython-310.pyc
│   │   │       │   ├── blocks.cpython-310.pyc
│   │   │       │   ├── configuration_mpt.cpython-310.pyc
│   │   │       │   ├── custom_embedding.cpython-310.pyc
│   │   │       │   ├── flash_attn_triton.cpython-310.pyc
│   │   │       │   ├── hf_prefixlm_converter.cpython-310.pyc
│   │   │       │   ├── meta_init_context.cpython-310.pyc
│   │   │       │   ├── modeling_mpt.cpython-310.pyc
│   │   │       │   ├── norm.cpython-310.pyc
│   │   │       │   └── param_init_fns.cpython-310.pyc
│   │   │       ├── adapt_tokenizer.py
│   │   │       ├── attention.py
│   │   │       ├── blocks.py
│   │   │       ├── configuration_mpt.py
│   │   │       ├── custom_embedding.py
│   │   │       ├── flash_attn_triton.py
│   │   │       ├── hf_prefixlm_converter.py
│   │   │       ├── meta_init_context.py
│   │   │       ├── modeling_mpt.py
│   │   │       ├── norm.py
│   │   │       └── param_init_fns.py
│   │   ├── llava_arch.py
│   │   ├── multimodal_encoder
│   │   │   ├── builder.py
│   │   │   └── clip_encoder.py
│   │   ├── multimodal_projector
│   │   │   └── builder.py
│   │   └── utils.py
│   ├── patch_divide.py
│   ├── serve
│   │   ├── __init__.py
│   │   ├── cli.py
│   │   ├── controller.py
│   │   ├── examples
│   │   │   ├── extreme_ironing.jpg
│   │   │   └── waterview.jpg
│   │   ├── gradio_web_server.py
│   │   ├── model_worker.py
│   │   ├── register_worker.py
│   │   └── test_message.py
│   ├── train
│   │   ├── llama_flash_attn_monkey_patch.py
│   │   ├── llava_trainer.py
│   │   ├── train.py
│   │   └── train_mem.py
│   └── utils.py
├── pyproject.toml
└── scripts
    ├── convert_docvqa_for_eval.py
    ├── convert_gqa_for_eval.py
    ├── convert_mmbench_for_submission.py
    ├── convert_mmvet_for_eval.py
    ├── convert_vizwiz_for_submission.py
    ├── convert_vqav2_for_submission.py
    ├── extract_mm_projector.py
    ├── finetune.sh
    ├── finetune_full_schedule.sh
    ├── finetune_lora.sh
    ├── finetune_qlora.sh
    ├── merge_lora_weights.py
    ├── pretrain.sh
    ├── v1_5
    │   ├── eval
    │   │   ├── docvqa.sh
    │   │   ├── gqa.sh
    │   │   ├── mmbench.sh
    │   │   ├── mme.sh
    │   │   ├── mmmu_val.sh
    │   │   ├── mmvet.sh
    │   │   ├── ocr_bench.sh
    │   │   ├── pope.sh
    │   │   ├── textvqa.sh
    │   │   ├── vizwiz.sh
    │   │   └── vqav2.sh
    │   ├── finetune.sh
    │   ├── finetune_hd.sh
    │   ├── pretrain.sh
    │   └── pretrain_hd.sh
    ├── zero2.json
    ├── zero3.json
    └── zero3_offload.json
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
21 |
22 |
23 | ---
24 |
25 | ## Comparisons with existing methods 💡
26 |
27 |
28 |
29 |
30 |
31 | ## Updates 📌
32 | - [2025/5/23] TokenPacker is accepted by **IJCV** 🎉🎉🎉.
33 | - [2024/10/22] We integrated TokenPacker-HD framework with [Osprey](https://github.com/CircleRadon/Osprey) to achieve fine-grained high-resolution pixel-level understanding with large performance gains. Please see the codes in this [branch](https://github.com/CircleRadon/TokenPacker/tree/tokenpacker-hd-osprey) for your reference.
34 | - [2024/7/25] We released [checkpoints](https://huggingface.co/collections/sunshine-lwt/tokenpacker-66a234618f0d2327e0cf2cb1), please check them.
35 | - [2024/7/3] We released the [paper](https://arxiv.org/abs/2407.02392) of our TokenPacker on Arxiv.
36 | - [2024/7/3] We released the training and inference codes.
37 |
38 |
39 | ## What is TokenPacker 👀
40 | TokenPacker is a novel visual projector that adopts a `coarse-to-fine` scheme
41 | to inject enriched fine-grained characteristics into the condensed visual tokens. With TokenPacker, we can compress the
42 | visual tokens by **75%∼89%** while achieving comparable or even better performance
43 | across diverse benchmarks with significantly higher efficiency.
44 |
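For intuition, the compression figures follow directly from the token budget. Assuming the usual 24×24 = 576 visual tokens of a CLIP-ViT-L/336px encoder (as in LLaVA-1.5), a `scale_factor` of 2 or 3 keeps 1/4 or 1/9 of the tokens, which is exactly the 75%∼89% range quoted above; a `scale_factor` of 4 compresses even further:

```python
# Rough token-budget check (576 = 24 x 24 tokens from CLIP-ViT-L/336px, as in LLaVA-1.5).
base_tokens = 24 * 24
for scale_factor in (2, 3, 4):
    kept = base_tokens // scale_factor ** 2   # TokenPacker keeps 1/s^2 of the visual tokens
    print(f"scale_factor={scale_factor}: {kept} tokens kept, "
          f"{1 - kept / base_tokens:.1%} compressed")
# scale_factor=2: 144 tokens kept, 75.0% compressed
# scale_factor=3: 64 tokens kept, 88.9% compressed
# scale_factor=4: 36 tokens kept, 93.8% compressed
```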
45 |
46 | #### Algorithms
47 | We provide pseudo-code to illustrate the detailed processing flow.
48 |
49 |
50 | #### Core codes
51 | As a visual projector, TokenPacker is implemented as the class `TokenPacker`, which can be found in [multimodal_projector/builder.py](./llava/model/multimodal_projector/builder.py#L39).
52 |
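The snippet below is **not** the released implementation, only a minimal, self-contained sketch of the interface such a coarse-to-fine projector exposes, assuming CLIP-ViT-L features of width 1024 and an LLM hidden size of 4096; please read the linked `builder.py` for the actual `TokenPacker` code.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ToyCoarseToFineProjector(nn.Module):
    """Illustrative stand-in (NOT the real TokenPacker): coarse queries come from
    downsampling the visual feature grid, and each query then attends to its own
    s x s fine-grained region before being projected to the LLM width."""

    def __init__(self, vis_dim=1024, llm_dim=4096, scale_factor=2):
        super().__init__()
        self.s = scale_factor
        self.q_proj = nn.Linear(vis_dim, vis_dim)
        self.kv_proj = nn.Linear(vis_dim, vis_dim)
        self.attn = nn.MultiheadAttention(vis_dim, num_heads=8, batch_first=True)
        self.to_llm = nn.Sequential(nn.Linear(vis_dim, llm_dim), nn.GELU(), nn.Linear(llm_dim, llm_dim))

    def forward(self, x):                              # x: (B, 576, vis_dim) from the vision encoder
        B, N, C = x.shape
        H = W = int(N ** 0.5)                          # 24 x 24 token grid
        s = self.s
        grid = x.transpose(1, 2).reshape(B, C, H, W)
        coarse = F.avg_pool2d(grid, s).flatten(2).transpose(1, 2)        # (B, N/s^2, C) coarse queries
        regions = F.unfold(grid, kernel_size=s, stride=s)                # (B, C*s*s, N/s^2)
        regions = regions.reshape(B, C, s * s, -1).permute(0, 3, 2, 1)   # (B, N/s^2, s*s, C) fine regions
        M = coarse.shape[1]
        q = self.q_proj(coarse).reshape(B * M, 1, C)
        kv = self.kv_proj(regions).reshape(B * M, s * s, C)
        packed, _ = self.attn(q, kv, kv)                                 # inject fine detail per region
        packed = packed.reshape(B, M, C) + coarse
        return self.to_llm(packed)                                       # (B, N/s^2, llm_dim)


# Usage: 576 CLIP tokens in, 144 condensed tokens out (scale_factor=2).
tokens = torch.randn(1, 576, 1024)
print(ToyCoarseToFineProjector()(tokens).shape)        # torch.Size([1, 144, 4096])
```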
53 | #### Comparisons with various projectors
54 |
55 |
56 |
57 | ## High-Resolution Image Understanding with TokenPacker 🔬
58 | To support efficient `high-resolution` image understanding, we further develop an effective image
59 | cropping method, `TokenPacker-HD`.
60 |
61 |
62 |
63 | ## Install 🛠️
64 | 1. Clone this repository and navigate to the TokenPacker folder
65 | ```
66 | git clone https://github.com/CircleRadon/TokenPacker.git
67 | cd TokenPacker
68 | ```
69 | 2. Install packages
70 | ```
71 | conda create -n tokenpacker python=3.10 -y
72 | conda activate tokenpacker
73 | pip install --upgrade pip # enable PEP 660 support
74 | pip install -e .
75 | ```
76 | 3. Install additional packages for training
77 | ```
78 | pip install -e ".[train]"
79 | pip install flash-attn --no-build-isolation
80 | ```
81 |
82 | ## Training 🚀
83 |
84 | ### LLaVA-TokenPacker
85 |
86 | #### Dataset
87 | To make a fair comparison, we use the same training data as in [LLaVA-1.5](https://github.com/haotian-liu/LLaVA), i.e., [LLaVA-Pretrain-558K](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/tree/main) for stage 1, and [Mix665k](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/tree/main) for stage 2.
88 |
89 | #### Training
90 | - Stage1: Image-Text Alignment Pre-training
91 | ```shell
92 | bash scripts/v1_5/pretrain.sh
93 | ```
94 | - Stage2: Visual Instruction Tuning
95 | ```shell
96 | bash scripts/v1_5/finetune.sh
97 | ```
98 | Note: Use `--scale_factor` to control the compression ratio; supported values are [2, 3, 4].
99 |
100 | ### LLaVA-TokenPacker-HD
101 |
102 | #### Dataset
103 | To obtain competitive high-resolution performance, we use the 2.7M samples organized by [Mini-Gemini](https://github.com/dvlab-research/MGM#Dataset), i.e., 1.2M for stage 1 and 1.5M for stage 2.
104 |
105 | #### Training
106 | - Stage1: Image-Text Alignment Pre-training
107 | ```shell
108 | bash scripts/v1_5/pretrain_hd.sh
109 | ```
110 | - Stage2: Visual Instruction Tuning
111 | ```shell
112 | bash scripts/v1_5/finetune_hd.sh
113 | ```
114 |
115 | Note:
116 | - Use `--scale_factor` to control the compression ratio; supported values are [2, 3, 4].
117 | - Use `--patch_num` to control the maximum number of patches an image is divided into; supported values are [9, 16, 25] (a rough token-budget sketch follows below).
118 |
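As referenced in the note above, the two flags together bound the number of visual tokens per image. The sketch below is a rough upper bound, assuming 336×336 crops of 576 tokens each plus one global view (as in the cropping logic of `llava/eval/mmmu/eval/run_llava.py`); the averages reported in the Model Zoo are lower because most images need fewer crops.

```python
# Rough upper bound on the HD token budget: up to patch_num crops plus one global view,
# each contributing 576 / scale_factor^2 tokens after TokenPacker.
def max_hd_tokens(patch_num, scale_factor=2, tokens_per_crop=24 * 24):
    per_crop = tokens_per_crop // scale_factor ** 2      # 144 tokens for scale_factor=2
    return (patch_num + 1) * per_crop                    # crops + the global thumbnail

for patch_num in (9, 16, 25):
    print(f"patch_num={patch_num}: at most {max_hd_tokens(patch_num)} visual tokens")
# patch_num=9: at most 1440 visual tokens
# patch_num=16: at most 2448 visual tokens
# patch_num=25: at most 3744 visual tokens
```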
119 |
120 | ## Experiments
121 |
122 |
123 |
124 |
125 |
126 |
127 | ## Model Zoo
128 |
129 | | Model              | Max Res.    | Compression Ratio | Token Num.   | Max Patch Num.   | Training Data                                                                                        | Download                                                                                |
130 | |--------------------|:-----------:|:---------------:|:------------:|:----------------:|:--------------------------------------------------------------------------------------------------:|---------------------------------------------------------------------------------------|
131 | | TokenPacker-7b | 336x336 | 1/4 | 144 | - | 558K+665K | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-7b-144token/tree/main) |
132 | | TokenPacker-13b | 336x336 | 1/4 | 144 | - | 558K+665K | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-13b-144token/tree/main) |
133 | | TokenPacker-HD-7b | 1088x1088 | 1/4 | ~954 | 9 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-7b-9patch-144token/tree/main) |
134 | | TokenPacker-HD-13b | 1088x1088 | 1/4 | ~954 | 9 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-9patch-144token/tree/main) |
135 | | TokenPacker-HD-13b | 1344x1344 | 1/4 | ~1393 | 16 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-16patch-144token/tree/main) |
136 | | TokenPacker-HD-13b | 1344x1344 | 1/9 | ~619 | 16 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-16patch-64token/tree/main) |
137 | | TokenPacker-HD-13b | 1344x1344 | 1/16 | ~347 | 16 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-16patch-36token/tree/main) |
138 |
139 | Note:
140 | - The `Token Num.` of TokenPacker-HD is the average over all training and test data.
141 | - The `558K+665K` training data follows LLaVA-1.5, and the `1.2M+1.5M` data follows Mini-Gemini.
142 | - All models use Vicuna-7B/13B as the base LLM.
143 |
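To fetch any of the checkpoints above locally, one convenient route is `huggingface_hub` (shown here with the 7B model; any repo id from the table works the same way):

```python
from huggingface_hub import snapshot_download

# Downloads the full checkpoint repo and returns the local cache path.
local_dir = snapshot_download(repo_id="sunshine-lwt/TokenPacker-7b-144token")
print("checkpoint available at:", local_dir)
```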
144 |
145 | ## Visualization
146 | We provide some visual examples.
147 |
148 |
149 |
150 |
151 | High-resolution image understanding.
152 |
153 |
154 |
155 | ## TODO List 📝
156 | - [x] Release the training and inference codes.
157 | - [x] Release all checkpoints.
158 |
159 |
160 | ## Acknowledgement 💌
161 | - [LLaVA-v1.5](https://github.com/haotian-liu/LLaVA): the codebase we built upon.
162 | - [Mini-Gemini](https://github.com/dvlab-research/MGM): the organized data we used for training the high-resolution model.
163 |
164 | ## More ##
165 | For more recent related works, please refer to the [Awesome-Token-Compress](https://github.com/daixiangzi/Awesome-Token-Compress) repo.
166 |
167 | ## BibTeX 🖊️
168 | ```
169 | @misc{TokenPacker,
170 | title={TokenPacker: Efficient Visual Projector for Multimodal LLM},
171 | author={Wentong Li and Yuqian Yuan and Jian Liu and Dongqi Tang and Song Wang and Jianke Zhu and Lei Zhang},
172 | year={2024},
173 | eprint={2407.02392},
174 | archivePrefix={arXiv},
175 | primaryClass={cs.CV}
176 | }
177 | ```
178 |
--------------------------------------------------------------------------------
/assets/Algorithm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/Algorithm.png
--------------------------------------------------------------------------------
/assets/compare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/compare.png
--------------------------------------------------------------------------------
/assets/ex1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/ex1.png
--------------------------------------------------------------------------------
/assets/ex2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/ex2.jpg
--------------------------------------------------------------------------------
/assets/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/framework.png
--------------------------------------------------------------------------------
/assets/hd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/hd.png
--------------------------------------------------------------------------------
/assets/high-reso.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/high-reso.jpg
--------------------------------------------------------------------------------
/assets/projector_comparsion.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/projector_comparsion.jpg
--------------------------------------------------------------------------------
/assets/title.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/title.png
--------------------------------------------------------------------------------
/assets/vis-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/vis-1.jpg
--------------------------------------------------------------------------------
/assets/vis-2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/vis-2.jpg
--------------------------------------------------------------------------------
/docs/evaluation.md:
--------------------------------------------------------------------------------
1 | # Evaluation
2 |
3 | ## DocVQA
4 | 1. Download `test_v1.0.json` to `./playground/data/eval/docvqa/data`.
5 | 2. Set `--image-folder` to the path of the [DocVQA](https://rrc.cvc.uab.es/?ch=17&com=downloads) images.
6 | 3. Multi-GPU inference.
7 | ```
8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/docvqa.sh
9 | ```
10 | 4. Submit the results to the [evaluation server](https://rrc.cvc.uab.es/?ch=17&com=evaluation&task=1): `./playground/data/eval/docvqa/answers/`
11 |
12 |
13 | ## GQA
14 | 1. Download the [data](https://cs.stanford.edu/people/dorarad/gqa/download.html) and [evaluation scripts](https://cs.stanford.edu/people/dorarad/gqa/evaluate.html) following the official instructions and put under `./playground/data/eval/gqa/data`. You may need to modify `eval.py` as [this](https://gist.github.com/haotian-liu/db6eddc2a984b4cbcc8a7f26fd523187) due to the missing assets in the GQA v1.2 release.
15 |
16 | 2. Multi-GPU inference.
17 | ```
18 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/gqa.sh
19 | ```
20 |
21 | ## MMBench
22 | 1. Download [mmbench_dev_20230712.tsv](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put under `./playground/data/eval/mmbench`.
23 | 2. Single-GPU inference.
24 | ```
25 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench.sh
26 | ```
27 | 3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `./playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712`.
28 |
29 |
30 | ## MME
31 | 1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation).
32 | 2. Download the images to `MME_Benchmark_release_version`.
33 | 3. Put the official `eval_tool` and `MME_Benchmark_release_version` under `./playground/data/eval/MME`.
34 | 4. Single-GPU inference and evaluation.
35 | ```Shell
36 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mme.sh
37 | ```
38 |
39 | ## MMMU_val
40 | 1. Download the [data](https://huggingface.co/datasets/MMMU/MMMU/tree/main).
41 | 2. Set `--data_path` to the path to MMMU images.
42 | 3. Multi-GPU inference.
43 | ```
44 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/mmmu_val.sh
45 | ```
46 |
47 | ## MM-Vet
48 | 1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `./playground/data/eval/mmvet`.
49 | 2. Single-GPU inference.
50 | ```Shell
51 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh
52 | ```
53 | 3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official jupyter notebook.
54 |
55 | ## OCRBench
56 | 1. Download the [data](https://github.com/Yuliang-Liu/MultimodalOCR).
57 | 2. Set `--image_folder` to the path of the OCRBench images and `--OCRBench_file` to the OCRBench JSON file.
58 | 3. Single-GPU inference.
59 | ```Shell
60 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/ocr_bench.sh
61 | ```
62 |
63 | ## POPE
64 | 1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put under `./playground/data/eval/pope`.
65 | 2. Single-GPU inference and evaluation.
66 | ```Shell
67 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/pope.sh
68 | ```
69 |
70 | ## TextVQA
71 | 1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and extract to `./playground/data/eval/textvqa`.
72 | 2. Download the [images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and set `--image-folder` to the path of the TextVQA images.
73 | 3. Single-GPU inference and evaluation.
74 | ```Shell
75 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/textvqa.sh
76 | ```
77 |
78 | ## VizWiz
79 | 1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `./playground/data/eval/vizwiz`.
80 | 2. Single-GPU inference.
81 | ```Shell
82 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/vizwiz.sh
83 | ```
84 | 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/2185/my-submission): `./playground/data/eval/vizwiz/answers_upload`.
85 |
86 |
87 | ## VQAv2
88 | 1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and set `--image-folder` to the path to `test2015`.
89 | 2. Multi-GPU inference.
90 | ```Shell
91 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/vqav2.sh
92 | ```
93 | 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `./playground/data/eval/vqav2/answers_upload`.
94 |
--------------------------------------------------------------------------------
/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import LlavaLlamaForCausalLM
2 |
--------------------------------------------------------------------------------
/llava/constants.py:
--------------------------------------------------------------------------------
1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30
2 | WORKER_HEART_BEAT_INTERVAL = 15
3 |
4 | LOGDIR = "."
5 |
6 | # Model Constants
7 | IGNORE_INDEX = -100
8 | IMAGE_TOKEN_INDEX = -200
9 | DEFAULT_IMAGE_TOKEN = ""
10 | DEFAULT_IMAGE_PATCH_TOKEN = ""
11 | DEFAULT_IM_START_TOKEN = ""
12 | DEFAULT_IM_END_TOKEN = ""
13 |
--------------------------------------------------------------------------------
/llava/eval/eval_gpt_review.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 |
5 | import openai
6 | import tqdm
7 | import ray
8 | import time
9 |
10 | NUM_SECONDS_TO_SLEEP = 3
11 |
12 | @ray.remote(num_cpus=4)
13 | def get_eval(content: str, max_tokens: int):
14 | while True:
15 | try:
16 | response = openai.ChatCompletion.create(
17 | model='gpt-4',
18 | messages=[{
19 | 'role': 'system',
20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
21 | }, {
22 | 'role': 'user',
23 | 'content': content,
24 | }],
25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation
26 | max_tokens=max_tokens,
27 | )
28 | break
29 | except openai.error.RateLimitError:
30 | pass
31 | except Exception as e:
32 | print(e)
33 | time.sleep(NUM_SECONDS_TO_SLEEP)
34 |
35 | print('success!')
36 | return response['choices'][0]['message']['content']
37 |
38 |
39 | def parse_score(review):
40 | try:
41 | score_pair = review.split('\n')[0]
42 | score_pair = score_pair.replace(',', ' ')
43 | sp = score_pair.split(' ')
44 | if len(sp) == 2:
45 | return [float(sp[0]), float(sp[1])]
46 | else:
47 | print('error', review)
48 | return [-1, -1]
49 | except Exception as e:
50 | print(e)
51 | print('error', review)
52 | return [-1, -1]
53 |
54 |
55 | if __name__ == '__main__':
56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
57 | parser.add_argument('-q', '--question')
58 | # parser.add_argument('-a', '--answer')
59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[])
60 | parser.add_argument('-r', '--rule')
61 | parser.add_argument('-o', '--output')
62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
63 | args = parser.parse_args()
64 |
65 | ray.init()
66 |
67 | f_q = open(os.path.expanduser(args.question))
68 | f_ans1 = open(os.path.expanduser(args.answer_list[0]))
69 | f_ans2 = open(os.path.expanduser(args.answer_list[1]))
70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
71 |
72 | review_file = open(f'{args.output}', 'w')
73 |
74 | js_list = []
75 | handles = []
76 | idx = 0
77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
78 | # if idx == 1:
79 | # break
80 |
81 | ques = json.loads(ques_js)
82 | ans1 = json.loads(ans1_js)
83 | ans2 = json.loads(ans2_js)
84 |
85 | category = json.loads(ques_js)['category']
86 | if category in rule_dict:
87 | rule = rule_dict[category]
88 | else:
89 | rule = rule_dict['default']
90 | prompt = rule['prompt']
91 | role = rule['role']
92 | content = (f'[Question]\n{ques["text"]}\n\n'
93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
95 | f'[System]\n{prompt}\n\n')
96 | js_list.append({
97 | 'id': idx+1,
98 | 'question_id': ques['question_id'],
99 | 'answer1_id': ans1['answer_id'],
100 | 'answer2_id': ans2['answer_id'],
101 | 'category': category})
102 | idx += 1
103 | handles.append(get_eval.remote(content, args.max_tokens))
104 | # To avoid the rate limit set by OpenAI
105 | time.sleep(NUM_SECONDS_TO_SLEEP)
106 |
107 | reviews = ray.get(handles)
108 | for idx, review in enumerate(reviews):
109 | scores = parse_score(review)
110 | js_list[idx]['content'] = review
111 | js_list[idx]['tuple'] = scores
112 | review_file.write(json.dumps(js_list[idx]) + '\n')
113 | review_file.close()
114 |
--------------------------------------------------------------------------------
/llava/eval/eval_gpt_review_bench.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 |
5 | import openai
6 | import time
7 |
8 | NUM_SECONDS_TO_SLEEP = 0.5
9 |
10 |
11 | def get_eval(content: str, max_tokens: int):
12 | while True:
13 | try:
14 | response = openai.ChatCompletion.create(
15 | model='gpt-4-0314',
16 | messages=[{
17 | 'role': 'system',
18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
19 | }, {
20 | 'role': 'user',
21 | 'content': content,
22 | }],
23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation
24 | max_tokens=max_tokens,
25 | )
26 | break
27 | except openai.error.RateLimitError:
28 | pass
29 | except Exception as e:
30 | print(e)
31 | time.sleep(NUM_SECONDS_TO_SLEEP)
32 |
33 | return response['choices'][0]['message']['content']
34 |
35 |
36 | def parse_score(review):
37 | try:
38 | score_pair = review.split('\n')[0]
39 | score_pair = score_pair.replace(',', ' ')
40 | sp = score_pair.split(' ')
41 | if len(sp) == 2:
42 | return [float(sp[0]), float(sp[1])]
43 | else:
44 | print('error', review)
45 | return [-1, -1]
46 | except Exception as e:
47 | print(e)
48 | print('error', review)
49 | return [-1, -1]
50 |
51 |
52 | if __name__ == '__main__':
53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
54 | parser.add_argument('-q', '--question')
55 | parser.add_argument('-c', '--context')
56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[])
57 | parser.add_argument('-r', '--rule')
58 | parser.add_argument('-o', '--output')
59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
60 | args = parser.parse_args()
61 |
62 | f_q = open(os.path.expanduser(args.question))
63 | f_ans1 = open(os.path.expanduser(args.answer_list[0]))
64 | f_ans2 = open(os.path.expanduser(args.answer_list[1]))
65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
66 |
67 | if os.path.isfile(os.path.expanduser(args.output)):
68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
69 | else:
70 | cur_reviews = []
71 |
72 | review_file = open(f'{args.output}', 'a')
73 |
74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
75 | image_to_context = {context['image']: context for context in context_list}
76 |
77 | handles = []
78 | idx = 0
79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
80 | ques = json.loads(ques_js)
81 | ans1 = json.loads(ans1_js)
82 | ans2 = json.loads(ans2_js)
83 |
84 | inst = image_to_context[ques['image']]
85 |
86 | if isinstance(inst['caption'], list):
87 | cap_str = '\n'.join(inst['caption'])
88 | else:
89 | cap_str = inst['caption']
90 |
91 | category = 'llava_bench_' + json.loads(ques_js)['category']
92 | if category in rule_dict:
93 | rule = rule_dict[category]
94 | else:
95 | assert False, f"Visual QA category not found in rule file: {category}."
96 | prompt = rule['prompt']
97 | role = rule['role']
98 | content = (f'[Context]\n{cap_str}\n\n'
99 | f'[Question]\n{ques["text"]}\n\n'
100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
102 | f'[System]\n{prompt}\n\n')
103 | cur_js = {
104 | 'id': idx+1,
105 | 'question_id': ques['question_id'],
106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']),
107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']),
108 | 'category': category
109 | }
110 | if idx >= len(cur_reviews):
111 | review = get_eval(content, args.max_tokens)
112 | scores = parse_score(review)
113 | cur_js['content'] = review
114 | cur_js['tuple'] = scores
115 | review_file.write(json.dumps(cur_js) + '\n')
116 | review_file.flush()
117 | else:
118 | print(f'Skipping {idx} as we already have it.')
119 | idx += 1
120 | print(idx)
121 | review_file.close()
122 |
--------------------------------------------------------------------------------
/llava/eval/eval_gpt_review_visual.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 |
5 | import openai
6 | import time
7 |
8 | NUM_SECONDS_TO_SLEEP = 0.5
9 |
10 |
11 | def get_eval(content: str, max_tokens: int):
12 | while True:
13 | try:
14 | response = openai.ChatCompletion.create(
15 | model='gpt-4-0314',
16 | messages=[{
17 | 'role': 'system',
18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
19 | }, {
20 | 'role': 'user',
21 | 'content': content,
22 | }],
23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation
24 | max_tokens=max_tokens,
25 | )
26 | break
27 | except openai.error.RateLimitError:
28 | pass
29 | except Exception as e:
30 | print(e)
31 | time.sleep(NUM_SECONDS_TO_SLEEP)
32 |
33 | return response['choices'][0]['message']['content']
34 |
35 |
36 | def parse_score(review):
37 | try:
38 | score_pair = review.split('\n')[0]
39 | score_pair = score_pair.replace(',', ' ')
40 | sp = score_pair.split(' ')
41 | if len(sp) == 2:
42 | return [float(sp[0]), float(sp[1])]
43 | else:
44 | print('error', review)
45 | return [-1, -1]
46 | except Exception as e:
47 | print(e)
48 | print('error', review)
49 | return [-1, -1]
50 |
51 |
52 | if __name__ == '__main__':
53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
54 | parser.add_argument('-q', '--question')
55 | parser.add_argument('-c', '--context')
56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[])
57 | parser.add_argument('-r', '--rule')
58 | parser.add_argument('-o', '--output')
59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
60 | args = parser.parse_args()
61 |
62 | f_q = open(os.path.expanduser(args.question))
63 | f_ans1 = open(os.path.expanduser(args.answer_list[0]))
64 | f_ans2 = open(os.path.expanduser(args.answer_list[1]))
65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
66 |
67 | if os.path.isfile(os.path.expanduser(args.output)):
68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
69 | else:
70 | cur_reviews = []
71 |
72 | review_file = open(f'{args.output}', 'a')
73 |
74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
75 | image_to_context = {context['image']: context for context in context_list}
76 |
77 | handles = []
78 | idx = 0
79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
80 | ques = json.loads(ques_js)
81 | ans1 = json.loads(ans1_js)
82 | ans2 = json.loads(ans2_js)
83 |
84 | inst = image_to_context[ques['image']]
85 | cap_str = '\n'.join(inst['captions'])
86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])
87 |
88 | category = json.loads(ques_js)['category']
89 | if category in rule_dict:
90 | rule = rule_dict[category]
91 | else:
92 | assert False, f"Visual QA category not found in rule file: {category}."
93 | prompt = rule['prompt']
94 | role = rule['role']
95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
96 | f'[Question]\n{ques["text"]}\n\n'
97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
99 | f'[System]\n{prompt}\n\n')
100 | cur_js = {
101 | 'id': idx+1,
102 | 'question_id': ques['question_id'],
103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']),
104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']),
105 | 'category': category
106 | }
107 | if idx >= len(cur_reviews):
108 | review = get_eval(content, args.max_tokens)
109 | scores = parse_score(review)
110 | cur_js['content'] = review
111 | cur_js['tuple'] = scores
112 | review_file.write(json.dumps(cur_js) + '\n')
113 | review_file.flush()
114 | else:
115 | print(f'Skipping {idx} as we already have it.')
116 | idx += 1
117 | print(idx)
118 | review_file.close()
119 |
--------------------------------------------------------------------------------
/llava/eval/eval_pope.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 |
5 | def eval_pope(answers, label_file):
6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')]
7 |
8 | for answer in answers:
9 | text = answer['text']
10 |
11 | # Only keep the first sentence
12 | if text.find('.') != -1:
13 | text = text.split('.')[0]
14 |
15 | text = text.replace(',', '')
16 | words = text.split(' ')
17 | if 'No' in words or 'not' in words or 'no' in words:
18 | answer['text'] = 'no'
19 | else:
20 | answer['text'] = 'yes'
21 |
22 | for i in range(len(label_list)):
23 | if label_list[i] == 'no':
24 | label_list[i] = 0
25 | else:
26 | label_list[i] = 1
27 |
28 | pred_list = []
29 | for answer in answers:
30 | if answer['text'] == 'no':
31 | pred_list.append(0)
32 | else:
33 | pred_list.append(1)
34 |
35 | pos = 1
36 | neg = 0
37 | yes_ratio = pred_list.count(1) / len(pred_list)
38 |
39 | TP, TN, FP, FN = 0, 0, 0, 0
40 | for pred, label in zip(pred_list, label_list):
41 | if pred == pos and label == pos:
42 | TP += 1
43 | elif pred == pos and label == neg:
44 | FP += 1
45 | elif pred == neg and label == neg:
46 | TN += 1
47 | elif pred == neg and label == pos:
48 | FN += 1
49 |
50 | print('TP\tFP\tTN\tFN\t')
51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN))
52 |
53 | precision = float(TP) / float(TP + FP)
54 | recall = float(TP) / float(TP + FN)
55 | f1 = 2*precision*recall / (precision + recall)
56 | acc = (TP + TN) / (TP + TN + FP + FN)
57 | print('Accuracy: {}'.format(acc))
58 | print('Precision: {}'.format(precision))
59 | print('Recall: {}'.format(recall))
60 | print('F1 score: {}'.format(f1))
61 | print('Yes ratio: {}'.format(yes_ratio))
62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) )
63 |
64 | if __name__ == "__main__":
65 | parser = argparse.ArgumentParser()
66 | parser.add_argument("--annotation-dir", type=str)
67 | parser.add_argument("--question-file", type=str)
68 | parser.add_argument("--result-file", type=str)
69 | args = parser.parse_args()
70 |
71 | questions = [json.loads(line) for line in open(args.question_file)]
72 | questions = {question['question_id']: question for question in questions}
73 | answers = [json.loads(q) for q in open(args.result_file)]
74 | for file in os.listdir(args.annotation_dir):
75 | assert file.startswith('coco_pope_')
76 | assert file.endswith('.json')
77 | category = file[10:-5]
78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category]
79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers)))
80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file))
81 | print("====================================")
82 |
--------------------------------------------------------------------------------
/llava/eval/eval_textvqa.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import json
4 | import re
5 |
6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator
7 |
8 |
9 | def get_args():
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--annotation-file', type=str)
12 | parser.add_argument('--result-file', type=str)
13 | parser.add_argument('--result-dir', type=str)
14 | return parser.parse_args()
15 |
16 |
17 | def prompt_processor(prompt):
18 | if prompt.startswith('OCR tokens: '):
19 | pattern = r"Question: (.*?) Short answer:"
20 | match = re.search(pattern, prompt, re.DOTALL)
21 | question = match.group(1)
22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
23 | if prompt.startswith('Reference OCR token:'):
24 | question = prompt.split('\n')[1]
25 | else:
26 | question = prompt.split('\n')[0]
27 | elif len(prompt.split('\n')) == 2:
28 | question = prompt.split('\n')[0]
29 | else:
30 | assert False
31 |
32 | return question.lower()
33 |
34 |
35 | def eval_single(annotation_file, result_file):
36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0]
37 | print(experiment_name)
38 | annotations = json.load(open(annotation_file))['data']
39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
40 | results = [json.loads(line) for line in open(result_file)]
41 |
42 | pred_list = []
43 | for result in results:
44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
45 | pred_list.append({
46 | "pred_answer": result['text'],
47 | "gt_answers": annotation['answers'],
48 | })
49 |
50 | evaluator = TextVQAAccuracyEvaluator()
51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
52 |
53 |
54 | if __name__ == "__main__":
55 | args = get_args()
56 |
57 | if args.result_file is not None:
58 | eval_single(args.annotation_file, args.result_file)
59 |
60 | if args.result_dir is not None:
61 | for result_file in sorted(os.listdir(args.result_dir)):
62 | if not result_file.endswith('.jsonl'):
63 | print(f'Skipping {result_file}')
64 | continue
65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file))
66 |
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/README.md:
--------------------------------------------------------------------------------
1 | # Evaluation Guidelines
2 | We provide detailed instructions for evaluation.
3 | To execute our evaluation script, please ensure that the structure of your model outputs is the same as ours.
4 |
5 | We provide two options:
6 | 1. Evaluation only: you can parse the responses on your own and simply provide one file with all the final predictions.
7 | 2. Parse and evaluation: you can leave the raw responses to our parsing and evaluation pipeline, using the output format shown below.
8 |
9 | ## Evaluation Only
10 | If you want to use your own parsing logic and *only provide the final answer*, you can use `main_eval_only.py`.
11 |
12 | You can provide all the outputs in *one file* in the following format:
13 |
14 | ```
15 | {
16 | "validation_Accounting_1": "D", # strictly "A", "B", "C", "D" for multi-choice question
17 | "validation_Architecture_and_Engineering_14": "0.0", # any string response for open question.
18 | ...
19 | }
20 | ```
21 | Then run eval_only with:
22 | ```
23 | python main_eval_only.py --output_path ./example_outputs/llava1.5_13b/total_val_output.json
24 | ```
25 |
26 | Please refer to [example output](https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/example_outputs/llava1.5_13b/total_val_output.json) for a detailed prediction file form.
27 |
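For convenience, here is a minimal sketch of writing such a prediction file from a plain Python dict (the two entries are the placeholder examples shown above):

```
import json

# Predictions keyed by MMMU question id: "A"-"D" for multiple-choice,
# free-form strings for open questions.
predictions = {
    "validation_Accounting_1": "D",
    "validation_Architecture_and_Engineering_14": "0.0",
}

with open("./example_outputs/llava1.5_13b/total_val_output.json", "w") as f:
    json.dump(predictions, f, indent=2)
```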
28 |
29 | ## Parse and Evaluation
30 | You can also provide the raw responses and run `main_parse_and_eval.py` to use our answer parsing and evaluation pipeline, as follows:
31 |
32 | ### Output folder structure
33 |
34 | ```
35 | └── model_name
36 | ├── category_name (e.g., Accounting)
37 | │ ├── output.json
38 | └── category_name (e.g., Electronics)
39 | ├── output.json
40 | ...
41 | ```
42 |
43 | ### Output file
44 | Each `output.json` contains a list of dicts with the instances to evaluate.
45 | ```
46 | [
47 | {
48 | "id": "validation_Electronics_28",
49 | "question_type": "multiple-choice",
50 | "answer": "A", # given answer
51 | "all_choices": [ # create using `get_multi_choice_info` in
52 | "A",
53 | "B",
54 | "C",
55 | "D"
56 | ],
57 | "index2ans": { # create using `get_multi_choice_info` in
58 | "A": "75 + 13.3 cos(250t - 57.7°)V",
59 | "B": "75 + 23.3 cos(250t - 57.7°)V",
60 | "C": "45 + 3.3 cos(250t - 57.7°)V",
61 | "D": "95 + 13.3 cos(250t - 57.7°)V"
62 | },
63 | "response": "B" # model response
64 | },
65 | {
66 | "id": "validation_Electronics_29",
67 | "question_type": "short-answer",
68 | "answer": "30", # given answer
69 | "response": "36 watts" # model response
70 | },
71 | ...
72 | ]
73 | ```
74 |
75 | ### Evaluation
76 | ```
77 | python main_parse_and_eval.py --path ./example_outputs/llava1.5_13b --subject ALL # all subject
78 |
79 | # OR you can specify one subject for the evaluation
80 |
81 | python main_parse_and_eval.py --path ./example_outputs/llava1.5_13b --subject elec # short name for Electronics. use --help for all short names
82 |
83 | ```
84 |
85 | `main_parse_and_eval.py` will generate `parsed_output.json` and `result.json` in each category subfolder, alongside the corresponding `output.json`.
86 |
87 | ```
88 | ├── Accounting
89 | │ ├── output.json
90 | │ ├── parsed_output.json
91 | │ └── result.json
92 | └── Electronics
93 | ├── output.json
94 | ├── parsed_output.json
95 | └── result.json
96 | ...
97 | ```
98 |
99 | ### Print Results
100 | You can print results locally if you want. (use `pip install tabulate` if you haven't)
101 | ```
102 | python print_results.py --path ./example_outputs/llava1.5_13b
103 | # Results may differ slightly due to the random selection for failed responses
104 | ```
105 |
106 |
107 |
108 | ##### Run LLaVA
109 | If you want to reproduce the results of some of the models, please check `run_llava.py` as an example.
110 |
111 | By setting up the environment following the [LLaVA official repo](https://github.com/haotian-liu/LLaVA) and installing Hugging Face's `datasets` package, you can run LLaVA via the following command:
112 |
113 | ```
114 | CUDA_VISIBLE_DEVICES=0 nohup python run_llava.py \
115 | --output_path example_outputs/llava1.5_13b_val.json \
116 | --model_path liuhaotian/llava-v1.5-13b \
117 | --config_path configs/llava1.5.yaml
118 | ```
119 |
120 | Then you can evaluate the results via the Evaluation Only pipeline described above.
121 |
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/configs/llava1.5.yaml:
--------------------------------------------------------------------------------
1 | task_instructions:
2 | - ""
3 | multi_choice_example_format:
4 | - "{}
5 |
6 | {}
7 |
8 | Answer with the option's letter from the given choices directly."
9 |
10 | short_ans_example_format:
11 | - "{}
12 |
13 | Answer the question using a single word or phrase."
14 | temperature:
15 | - 0
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/convert_to_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from argparse import ArgumentParser
4 |
5 | from utils.eval_utils import evaluate
6 | from utils.data_utils import save_json
7 |
8 |
9 | def main():
10 | parser = ArgumentParser()
11 | parser.add_argument('--result_file', type=str, default='llava1.5_13b_val.txt',
12 | help='name of saved json')
13 | parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json',
14 | help='name of saved json')
15 |
16 | args = parser.parse_args()
17 | out_samples = [json.loads(line) for line in open(args.result_file)]
18 | out_json = {}
19 | for _sample in out_samples:
20 | _result = _sample['parsed_pred']
21 | if isinstance(_result, list):
22 | _result = str(_result[0])
23 | out_json[_sample['id']] = _result
24 |
25 | save_json(args.output_path, out_json)
26 |
27 |
28 | if __name__ == '__main__':
29 | main()
30 |
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from argparse import ArgumentParser
4 |
5 | from utils.eval_utils import evaluate
6 | from utils.data_utils import save_json
7 |
8 |
9 | def main():
10 | parser = ArgumentParser()
11 | parser.add_argument('--result_file', type=str, default='llava1.5_13b_val.txt',
12 | help='name of saved json')
13 | parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json',
14 | help='name of saved json')
15 |
16 | args = parser.parse_args()
17 | out_samples = [json.loads(line) for line in open(args.result_file)]
18 |
19 | judge_dict, metric_dict = evaluate(out_samples)
20 | metric_dict.update({"num_example": len(out_samples)})
21 | judge_dict['metric_dict'] = metric_dict
22 | save_dir = '/'.join(args.output_path.split('/')[:-1])
23 | if not os.path.exists(save_dir):
24 | os.makedirs(save_dir)
25 | save_json(args.output_path, judge_dict)
26 |
27 | print(metric_dict)
28 |
29 |
30 | if __name__ == '__main__':
31 | main()
32 |
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/main_eval_only.py:
--------------------------------------------------------------------------------
1 | """Parse and Evalate"""
2 | import os
3 | import json
4 |
5 | import pdb
6 | from argparse import ArgumentParser
7 |
8 | from utils.data_utils import save_json, CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT
9 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response, calculate_ins_level_acc
10 |
11 |
12 | if __name__ == '__main__':
13 |
14 | parser = ArgumentParser()
15 | parser.add_argument('--output_path', type=str, default="./example_outputs/qwen_vl/total_val_output.json", help="The path to model output file.")
16 | parser.add_argument('--answer_path', type=str, default="./answer_dict_val.json", help="Answer file path.")
17 | args = parser.parse_args()
18 |
19 | output_dict = json.load(open(args.output_path))
20 | answer_dict = json.load(open(args.answer_path))
21 |
22 | # group by category
23 | output_dict_w_cat = {}
24 | for data_id, parsed_pred in output_dict.items():
25 | category = "_".join(data_id.split("_")[1:-1])
26 | if category not in output_dict_w_cat:
27 | output_dict_w_cat.update({category: {}})
28 | output_dict_w_cat[category].update({data_id: parsed_pred})
29 |
30 | # group by category
31 | answer_dict_w_cat = {}
32 | for data_id, parsed_pred in answer_dict.items():
33 | category = "_".join(data_id.split("_")[1:-1])
34 | if category not in answer_dict_w_cat:
35 | answer_dict_w_cat.update({category: {}})
36 | answer_dict_w_cat[category].update({data_id: parsed_pred})
37 |
38 | evaluation_result = {}
39 |
40 | for category in CAT_SHORT2LONG.values():
41 | print("Evaluating: {}".format(category))
42 | # get cat_outputs and cat_answers
43 | try:
44 | cat_outputs = output_dict_w_cat[category]
45 | cat_answers = answer_dict_w_cat[category]
46 | except KeyError:
47 | print("Skipping {} for not found".format(category))
48 | continue
49 |
50 | exampels_to_eval = []
51 | for data_id, parsed_pred in cat_outputs.items():
52 | question_type = cat_answers[data_id]['question_type']
53 | if question_type != 'multiple-choice':
54 | parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.)
55 | else:
56 | parsed_pred = parsed_pred
57 |
58 | exampels_to_eval.append({
59 | "id": data_id,
60 | "question_type": question_type,
61 | "answer": cat_answers[data_id]['ground_truth'],
62 | "parsed_pred": parsed_pred
63 | })
64 |
65 | judge_dict, metric_dict = evaluate(exampels_to_eval)
66 | metric_dict.update({"num_example": len(exampels_to_eval)})
67 |
68 | evaluation_result[category] = metric_dict
69 |
70 | printable_results = {}
71 | # pdb.set_trace()
72 | # add domain Subject
73 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items():
74 | in_domain_cat_results = {}
75 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT
76 | if cat_name in evaluation_result.keys():
77 | in_domain_cat_results[cat_name] = evaluation_result[cat_name]
78 | else:
79 | pass
80 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results)
81 | in_domain_data_num = sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()])
82 | printable_results['Overall-' + domain] = {"num": int(in_domain_data_num),
83 | "acc": round(in_domain_ins_acc, 3)
84 | }
85 | # add sub category
86 | for cat_name, cat_results in in_domain_cat_results.items():
87 | printable_results[cat_name] = {"num": int(cat_results['num_example']),
88 | "acc": round(cat_results['acc'], 3)
89 | }
90 |
91 | # table.append(["-----------------------------", "-----", "----"])
92 | all_ins_acc = calculate_ins_level_acc(evaluation_result)
93 | printable_results['Overall'] = {"num": sum([cat_results['num_example'] for cat_results in evaluation_result.values()]),
94 | "acc": round(all_ins_acc, 3)
95 | }
96 |
97 | print(printable_results)
98 |
99 |
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/main_parse_and_eval.py:
--------------------------------------------------------------------------------
1 | """Parse and Evalate"""
2 | import os
3 | import json
4 | from argparse import ArgumentParser
5 |
6 | from utils.data_utils import save_json, CAT_SHORT2LONG
7 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response
8 |
9 |
10 | if __name__ == '__main__':
11 |
12 | parser = ArgumentParser()
13 | parser.add_argument('--path', type=str, default="./example_outputs/llava1.5_13b", help="The path to model output directory.")
14 | parser.add_argument('--subject', nargs='+',
15 | help=f'The name of the mmmu sub-category. Available: {CAT_SHORT2LONG.keys()} or ALL')
16 |
17 | args = parser.parse_args()
18 | if args.subject[0] == 'ALL':
19 | args.subject = CAT_SHORT2LONG.keys()
20 |
21 | ex_output_path = os.path.join(args.path)
22 |
23 | all_results = {}
24 | for cat_short in args.subject:
25 | category = CAT_SHORT2LONG[cat_short]
26 | print("Evaluating: {}".format(category))
27 | if category not in os.listdir(ex_output_path):
28 | print("Skipping {} for not found".format(category))
29 | else:
30 | cat_folder_path = os.path.join(ex_output_path, category)
31 | cat_outputs = json.load(open(os.path.join(cat_folder_path, 'output.json')))
32 | # Evaluation
33 | eval_samples = []
34 | for cat_output in cat_outputs:
35 | response = cat_output['response']
36 | if cat_output['question_type'] == 'multiple-choice':
37 | all_choices = cat_output['all_choices']
38 | index2ans = cat_output['index2ans']
39 | parsed_pred = parse_multi_choice_response(response, all_choices, index2ans)
40 | eval_samples.append(
41 | {
42 | 'id': cat_output['id'],
43 | 'question_type': cat_output['question_type'],
44 | 'answer': cat_output['answer'], # the content in option, not answer index.
45 | 'response': response,
46 | 'parsed_pred': parsed_pred,
47 | 'index2ans': index2ans,
48 | }
49 | )
50 | else: # open
51 | parsed_pred = parse_open_response(response)
52 | eval_samples.append(
53 | {
54 | 'id': cat_output['id'],
55 | 'question_type': cat_output['question_type'],
56 | 'answer': cat_output['answer'],
57 | 'response': response,
58 | 'parsed_pred': parsed_pred,
59 | }
60 | )
61 |
62 | print("Num of valid samples: {}, Expected Num: {}".format(len(eval_samples), len(cat_outputs)))
63 |
64 | judge_dict, metric_dict = evaluate(eval_samples)
65 | metric_dict.update({"num_example": len(eval_samples)})
66 | for eval_sample in eval_samples:
67 | eval_sample.update({"judge": judge_dict[eval_sample['id']]})
68 |
69 | save_json(os.path.join(cat_folder_path, 'parsed_output.json'), eval_samples)
70 | save_json(os.path.join(cat_folder_path, 'result.json'), metric_dict)
71 |
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/print_results.py:
--------------------------------------------------------------------------------
1 | # Beautiful table to print results of all categories
2 |
3 | import os
4 | from typing import Dict
5 | import json
6 | import numpy as np
7 | from tabulate import tabulate
8 |
9 | from argparse import ArgumentParser
10 |
11 | from utils.data_utils import CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT
12 |
13 | from utils.eval_utils import calculate_ins_level_acc
14 |
15 | def main():
16 | parser = ArgumentParser()
17 | parser.add_argument('--path', type=str, default="./example_outputs/blip2_flant5xxl", help="The path to output directory.")
18 | args = parser.parse_args()
19 |
20 | # load all results
21 | all_results = {}
22 | for cat_folder_name in os.listdir(args.path):
23 | if cat_folder_name in CAT_SHORT2LONG.values():
24 | cat_folder_path = os.path.join(args.path, cat_folder_name)
25 | result_path = os.path.join(cat_folder_path, 'result.json')
26 | if os.path.exists(result_path):
27 | cat_results = json.load(open(result_path))
28 | all_results[cat_folder_name] = cat_results
29 |
30 | # print results
31 | headers = ['Subject', 'Data Num', 'Acc']
32 | table = []
33 |
34 | # add domain Subject
35 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items():
36 | in_domain_cat_results = {}
37 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT
38 | if cat_name in all_results.keys():
39 | in_domain_cat_results[cat_name] = all_results[cat_name]
40 | else:
41 | pass
42 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results)
43 | in_domain_data_num = np.sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()])
44 | table.append(['Overall-' + domain, int(in_domain_data_num), round(in_domain_ins_acc, 3)])
45 | # add sub category
46 | for cat_name, cat_results in in_domain_cat_results.items():
47 | table.append([cat_name, int(cat_results['num_example']), round(cat_results['acc'], 3)])
48 | # table.append(["-----------------------------", "-----", "----"])
49 |
50 | # table.append(["-----------------------------", "-----", "----"])
51 | all_ins_acc = calculate_ins_level_acc(all_results)
52 | table.append(['Overall', np.sum([cat_results['num_example'] for cat_results in all_results.values()]), round(all_ins_acc, 3)])
53 |
54 | print(tabulate(table, headers=headers, tablefmt='orgtbl'))
55 |
56 |
57 | if __name__ == '__main__':
58 | main()
59 |
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/run_llava.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 | import random
4 |
5 | import numpy as np
6 | import math
7 | from tqdm import tqdm
8 | import json
9 |
10 | from datasets import load_dataset, concatenate_datasets
11 | from argparse import ArgumentParser
12 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
13 | from llava.model import *
14 | from llava.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path
15 | from utils.data_utils import load_yaml, construct_prompt, save_json, process_single_sample, CAT_SHORT2LONG
16 | from utils.model_utils_ind import call_llava_engine_df
17 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response
18 | import torch.nn.functional as F
19 | from functools import partial
20 | from llava.patch_divide import Image_Patch
21 | from torchvision.transforms import Compose, ToTensor, Normalize
22 |
23 | def set_seed(seed_value):
24 | """
25 | Set the seed for PyTorch (both CPU and CUDA), Python, and NumPy for reproducible results.
26 |
27 | :param seed_value: An integer value to be used as the seed.
28 | """
29 | torch.manual_seed(seed_value)
30 | if torch.cuda.is_available():
31 | torch.cuda.manual_seed(seed_value)
32 | torch.cuda.manual_seed_all(seed_value) # For multi-GPU setups
33 | random.seed(seed_value)
34 | np.random.seed(seed_value)
35 | torch.backends.cudnn.deterministic = True
36 | torch.backends.cudnn.benchmark = False
37 |
38 | def split_list(lst, n):
39 | """Split a list into n (roughly) equal-sized chunks"""
40 | chunk_size = math.ceil(len(lst) / n) # ceiling division
41 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
42 |
43 |
44 | def get_chunk(lst, n, k):
45 | chunks = split_list(lst, n)
46 | return chunks[k]
47 |
48 |
49 | def main():
50 | parser = ArgumentParser()
51 | # parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json',
52 | # help='name of saved json')
53 | parser.add_argument('--config_path', type=str, default="configs/llava1.5.yaml")
54 | parser.add_argument('--data_path', type=str, default="MMMU/MMMU") # hf dataset path.
55 | parser.add_argument('--model_path', type=str, default="liuhaotian/llava-v1.5-13b")
56 | parser.add_argument("--conv-mode", type=str, default="llava_v1")
57 | parser.add_argument("--num-chunks", type=int, default=1)
58 | parser.add_argument("--chunk-idx", type=int, default=0)
59 | parser.add_argument("--answers-file", type=str, default="answer.jsonl")
60 | parser.add_argument('--split', type=str, default='validation')
61 | parser.add_argument('--seed', type=int, default=42)
62 | parser.add_argument('--load_8bit', type=bool, default=False)
63 |
64 | args = parser.parse_args()
65 | # device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
66 | set_seed(args.seed)
67 |
68 | print('llava_initializing...')
69 | processor = None
70 | call_model_engine = call_llava_engine_df
71 |
72 | # load config and process to one value
73 | args.config = load_yaml(args.config_path)
74 | for key, value in args.config.items():
75 | if key != 'eval_params' and type(value) == list:
76 | assert len(value) == 1, 'key {} has more than one value'.format(key)
77 | args.config[key] = value[0]
78 |
79 | model_path = os.path.expanduser(args.model_path)
80 | model_name = get_model_name_from_path(model_path)
81 | tokenizer = AutoTokenizer.from_pretrained(
82 | args.model_path,
83 | model_max_length = 2048,
84 | padding_side="right",
85 | use_fast = True
86 | )
87 | model = LlavaLlamaForCausalLM.from_pretrained(
88 | args.model_path,
89 | torch_dtype=torch.bfloat16,
90 | ).cuda()
91 |
92 | for m in model.modules():
93 | m.tokenizer = tokenizer
94 |
95 | vision_tower = model.get_vision_tower()
96 | if not vision_tower.is_loaded:
97 | vision_tower.load_model()
98 | vision_tower.to(device='cuda', dtype=torch.float16)
99 | image_processor = vision_tower.image_processor
100 |
101 | patch_num = getattr(model.config, 'patch_num', '9')
102 | image_patch = Image_Patch(patch_num=int(patch_num))
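   | # Image_Patch (from llava/patch_divide.py) picks how the image is tiled: calculate(h, w)
   | # returns the (h_block, w_block) grid of 336-pixel blocks used to slice the image below.
   | # The Normalize constants below are the CLIP image mean/std used by the vision tower.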
103 | preprocess = Compose([ToTensor(), Normalize((0.48145466, 0.4578275, 0.40821073),(0.26862954, 0.26130258, 0.27577711))])
104 |
105 |
106 | # run for each subject
107 | sub_dataset_list = []
108 | subjects = [x for x in CAT_SHORT2LONG.values()]
109 | '''
110 | subjects = [
111 | 'Architecture_and_Engineering', 'Computer_Science', 'Electronics',
112 | 'Energy_and_Power', 'Materials', 'Mechanical_Engineering'
113 | ]
114 | '''
115 | for subject in tqdm(subjects):
116 | sub_dataset = load_dataset(args.data_path, subject, split=args.split)
117 | sub_dataset_list.append(sub_dataset)
118 |
119 | sub_dataset_list = get_chunk(sub_dataset_list, args.num_chunks, args.chunk_idx)
120 |
121 | # merge all dataset
122 | dataset = concatenate_datasets(sub_dataset_list)
123 |
124 | # samples = []
125 | out_samples = []
126 | for sample in tqdm(dataset):
127 | sample = process_single_sample(sample)
128 |
129 | sample = construct_prompt(sample, args.config)
130 | if sample['image']:
131 | image = sample['image'].convert('RGB')
132 | if model.config.image_aspect_ratio == 'slice':
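   | # 'slice' (high-resolution) mode: resize the image to fit an h_block x w_block grid of
   | # 336x336 blocks while keeping its aspect ratio, zero-pad the remainder, and cut the
   | # padded canvas into one crop per block.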
133 | image = preprocess(image)
134 | image = image.unsqueeze(0)
135 | h, w = image.shape[-2:]
136 | block_size = 336
137 | h_block, w_block = image_patch.calculate(h, w)
138 | h_ratio = block_size*h_block/h
139 | w_ratio = block_size*w_block/w
140 | if h_ratio<=w_ratio:
141 | w_ = min(block_size*w_block, round(w*h_ratio))
142 | h_ = block_size*h_block
143 | else:
144 | w_ = block_size*w_block
145 | h_ = min(block_size*h_block, round(h*w_ratio))
146 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear')
147 | image = torch.zeros((1, 3, block_size*h_block, block_size*w_block)).to(dtype=image_inter.dtype, device=image_inter.device)
148 | image[:, :, :h_, :w_] = image_inter
149 |
150 | split_images = []
151 | for i_ in range(h_block):
152 | for j_ in range(w_block):
153 | image_s = image[:,:,block_size*i_:block_size*(i_+1), block_size*j_:block_size*(j_+1)]
154 | split_images.append(image_s)
155 | if len(split_images)>1:
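   | # When the image was split into several crops, also append a single 336x336 thumbnail of
   | # the whole padded image so the model keeps a low-resolution global view.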
156 | h_ratio = block_size/h
157 | w_ratio = block_size/w
158 | if h_ratio<=w_ratio:
159 | w_ = min(block_size, round(w*h_ratio))
160 | h_ = block_size
161 | else:
162 | w_ = block_size
163 | h_ = min(block_size, round(h*w_ratio))
164 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear')
165 | image_s = torch.zeros((1, 3, block_size, block_size)).to(dtype=image_inter.dtype, device=image_inter.device)
166 | image_s[:, :, :h_, :w_] = image_inter
167 | split_images.append(image_s)
168 | image_tensor = torch.cat(split_images, dim=0)
169 | else:
170 | image_tensor = process_images([image], image_processor, model.config)[0]
171 | image_tensor = image_tensor.unsqueeze(0)
172 | h_block = 1
173 | w_block = 1
174 |
175 | sample['image'] = image_tensor
176 |
177 | # samples.append(sample)
178 | mode = model.config.image_aspect_ratio
179 | with torch.no_grad():
180 | response = call_model_engine(args, sample, model, tokenizer, processor, h_block, w_block, mode)
181 | if sample['question_type'] == 'multiple-choice':
182 | parsed_pred = parse_multi_choice_response(response, sample['all_choices'], sample['index2ans'])
183 | out_sample = {
184 | 'id': sample['id'],
185 | 'question_type': sample['question_type'],
186 | 'answer': sample['answer'],
187 | 'response': response,
188 | 'parsed_pred': parsed_pred,
189 | 'index2ans': sample['index2ans'],
190 | }
191 | else: # open question
192 | parsed_pred = parse_open_response(response)
193 | out_sample = {
194 | 'id': sample['id'],
195 | 'question_type': sample['question_type'],
196 | 'answer': sample['answer'],
197 | 'response': response,
198 | 'parsed_pred': parsed_pred,
199 | }
200 | out_samples.append(out_sample)
201 |
202 | answers_file = os.path.expanduser(args.answers_file)
203 | os.makedirs(os.path.dirname(answers_file) or '.', exist_ok=True)  # dirname is '' when answers_file has no directory component
204 | ans_file = open(answers_file, "w")
205 | for sample in out_samples:
206 | ans_file.write(json.dumps(sample) + "\n")
207 | ans_file.close()
208 |
209 | if __name__ == '__main__':
210 | main()
211 |
212 |
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/utils/__pycache__/data_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/eval/mmmu/eval/utils/__pycache__/data_utils.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/utils/__pycache__/eval_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/eval/mmmu/eval/utils/__pycache__/eval_utils.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/utils/__pycache__/model_utils_ind.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/eval/mmmu/eval/utils/__pycache__/model_utils_ind.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/utils/data_utils.py:
--------------------------------------------------------------------------------
1 | """Utils for data load, save, and process (e.g., prompt construction)"""
2 |
3 | import os
4 | import json
5 | import yaml
6 | import re
7 |
8 |
9 | DOMAIN_CAT2SUB_CAT = {
10 | 'Art and Design': ['Art', 'Art_Theory', 'Design', 'Music'],
11 | 'Business': ['Accounting', 'Economics', 'Finance', 'Manage','Marketing'],
12 | 'Science': ['Biology', 'Chemistry', 'Geography', 'Math', 'Physics',],
13 | 'Health and Medicine': ['Basic_Medical_Science', 'Clinical_Medicine', 'Diagnostics_and_Laboratory_Medicine', 'Pharmacy', 'Public_Health'],
14 | 'Humanities and Social Science': ['History', 'Literature', 'Sociology', 'Psychology'],
15 | 'Tech and Engineering': ['Agriculture', 'Architecture_and_Engineering', 'Computer_Science', 'Electronics', 'Energy_and_Power', 'Materials', 'Mechanical_Engineering'],
16 | }
17 |
18 |
19 | CAT_SHORT2LONG = {
20 | 'acc': 'Accounting',
21 | 'agri': 'Agriculture',
22 | 'arch': 'Architecture_and_Engineering',
23 | 'art': 'Art',
24 | 'art_theory': 'Art_Theory',
25 | 'bas_med': 'Basic_Medical_Science',
26 | 'bio': 'Biology',
27 | 'chem': 'Chemistry',
28 | 'cli_med': 'Clinical_Medicine',
29 | 'cs': 'Computer_Science',
30 | 'design': 'Design',
31 | 'diag_med': 'Diagnostics_and_Laboratory_Medicine',
32 | 'econ': 'Economics',
33 | 'elec': 'Electronics',
34 | 'ep': 'Energy_and_Power',
35 | 'fin': 'Finance',
36 | 'geo': 'Geography',
37 | 'his': 'History',
38 | 'liter': 'Literature',
39 | 'manage': 'Manage',
40 | 'mark': 'Marketing',
41 | 'mate': 'Materials',
42 | 'math': 'Math',
43 | 'mech': 'Mechanical_Engineering',
44 | 'music': 'Music',
45 | 'phar': 'Pharmacy',
46 | 'phys': 'Physics',
47 | 'psy': 'Psychology',
48 | 'pub_health': 'Public_Health',
49 | 'socio': 'Sociology'
50 | }
51 |
52 | # DATA SAVING
53 | def save_json(filename, ds):
54 | with open(filename, 'w') as f:
55 | json.dump(ds, f, indent=4)
56 |
57 |
58 | def get_multi_choice_info(options):
59 | """
60 | Given the list of options for multiple choice question
61 | Return the index2ans and all_choices
62 | """
63 |
64 | start_chr = 'A'
65 | all_choices = []
66 | index2ans = {}
67 | for i, option in enumerate(options):
68 | index2ans[chr(ord(start_chr) + i)] = option
69 | all_choices.append(chr(ord(start_chr) + i))
70 |
71 | return index2ans, all_choices
72 |
73 | def load_yaml(file_path):
74 | with open(file_path, 'r') as stream:
75 | try:
76 | yaml_dict = yaml.safe_load(stream)
77 | except yaml.YAMLError as exc:
78 | print(exc); raise  # re-raise so a parse failure doesn't fall through to an unbound yaml_dict
79 |
80 | return yaml_dict
81 |
82 |
83 | def parse_img_path(text):
84 | matches = re.findall("
", text)
85 | return matches
86 |
87 | def process_single_sample(data):
88 | question = data['question']
89 | o_imgs_paths = []
90 | for option in data['options']:
91 | current_o_imgs_paths = parse_img_path(option)
92 | for img_path in current_o_imgs_paths:
93 | o_imgs_paths.append(img_path)
94 |
95 | if len(o_imgs_paths) > 1: # multiple images in options, used for random selection
96 | return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'],
97 | 'image': None, 'question_type': data['question_type']}
98 | else:
99 | return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'],
100 | 'image': data['image_1'], 'question_type': data['question_type']}
101 |
102 |
108 | def save_jsonl(filename, data):
109 | """
110 | Save a dictionary of data to a JSON Lines file with the filename as key and caption as value.
111 |
112 | Args:
113 | filename (str): The path to the file where the data should be saved.
114 | data (dict): The dictionary containing the data to save where key is the image path and value is the caption.
115 | """
116 | with open(filename, 'w', encoding='utf-8') as f:
117 | for img_path, caption in data.items():
118 | # Extract the base filename without the extension
119 | base_filename = os.path.basename(img_path)
120 | # Create a JSON object with the filename as the key and caption as the value
121 | json_record = json.dumps({base_filename: caption}, ensure_ascii=False)
122 | # Write the JSON object to the file, one per line
123 | f.write(json_record + '\n')
124 |
125 | def save_args(args, path_dir):
126 | argsDict = args.__dict__
127 | with open(path_dir + 'setting.txt', 'w') as f:
128 | f.writelines('------------------ start ------------------' + '\n')
129 | for eachArg, value in argsDict.items():
130 | f.writelines(eachArg + ' : ' + str(value) + '\n')
131 | f.writelines('------------------- end -------------------')
132 |
133 |
134 |
135 | # DATA PROCESSING
136 | def construct_prompt(sample, config):
137 | question = sample['question']
138 | options = eval(sample['options'])
139 | example = ""
140 | if sample['question_type'] == 'multiple-choice':
141 | start_chr = 'A'
142 | prediction_range = []
143 | index2ans = {}
144 | for option in options:
145 | prediction_range.append(start_chr)
146 | example += f"({start_chr}) {option}\n"
147 | index2ans[start_chr] = option
148 | start_chr = chr(ord(start_chr) + 1)
149 | empty_prompt_sample_structure = config['multi_choice_example_format']
150 | empty_prompt = empty_prompt_sample_structure.format(question, example)
151 | res_dict = {}
152 | res_dict['index2ans'] = index2ans
153 | res_dict['correct_choice'] = sample['answer']
154 | res_dict['all_choices'] = prediction_range
155 | res_dict['empty_prompt'] = empty_prompt
156 | if config['task_instructions']:
157 | res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt
158 | else:
159 | res_dict['final_input_prompt'] = empty_prompt
160 |
161 | res_dict['gt_content'] = options[ord(sample['answer'].upper()) - ord('A')]
162 | else:
163 | empty_prompt_sample_structure = config['short_ans_example_format']
164 | empty_prompt = empty_prompt_sample_structure.format(question)
165 | res_dict = {}
166 | res_dict['empty_prompt'] = empty_prompt
167 | if config['task_instructions']:
168 | res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt
169 | else:
170 | res_dict['final_input_prompt'] = empty_prompt
171 | res_dict['gt_content'] = sample['answer']
172 |
173 | res_dict.update(sample)
174 | return res_dict
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | import random  # module import; random.choice is used below
2 | import torch
3 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
4 | from llava.conversation import conv_templates, SeparatorStyle
5 |
6 | def call_llava_engine_df(args, sample, model, tokenizer=None, processor=None):
7 |
8 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
9 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
10 |
11 | def insert_separator(X, sep):
12 | return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
13 |
14 | input_ids = []
15 | offset = 0
16 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
17 | offset = 1
18 | input_ids.append(prompt_chunks[0][0])
19 |
20 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
21 | input_ids.extend(x[offset:])
22 |
23 | if return_tensors is not None:
24 | if return_tensors == 'pt':
25 | return torch.tensor(input_ids, dtype=torch.long)
26 | raise ValueError(f'Unsupported tensor type: {return_tensors}')
27 | return input_ids
28 |
29 | def deal_with_prompt(input_text, mm_use_im_start_end):
30 | qs = input_text
31 | if mm_use_im_start_end:
32 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
33 | else:
34 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
35 | return qs
36 |
37 | prompt = sample['final_input_prompt']
38 | prompt = deal_with_prompt(prompt, model.config.mm_use_im_start_end)
39 | conv = conv_templates['vicuna_v1'].copy()
40 | conv.append_message(conv.roles[0], prompt)
41 | conv.append_message(conv.roles[1], None)
42 | prompt = conv.get_prompt()
43 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
44 | image = sample['image']
45 | if image is not None:
46 | output_ids = model.generate(
47 | input_ids,
48 | images=image.unsqueeze(0).half().cuda(),
49 | do_sample=True,
50 | temperature=1,
51 | top_p=None,
52 | num_beams=5,
53 | max_new_tokens=128,
54 | use_cache=True)
55 |
56 | # input_token_len = input_ids.shape[1]
57 | # n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
58 | # if n_diff_input_output > 0:
59 | # print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
60 | # response = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
61 | response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
62 | else: # multiple images actually
63 | if sample['question_type'] == 'multiple-choice':
64 | all_choices = sample['all_choices']
65 | response = random.choice(all_choices)
66 | else:
67 | response = 'INVALID GENERATION FOR MULTIPLE IMAGE INPUTS'
68 |
69 | return response
70 |
71 |
72 | def llava_image_processor(raw_image, vis_processors=None):
73 | image_tensor = vis_processors.preprocess(raw_image, return_tensors='pt')['pixel_values'][0]
74 | return image_tensor
75 |
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/utils/model_utils_ind.py:
--------------------------------------------------------------------------------
1 | import random  # module import; random.choice is used below
2 | import torch
3 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
4 | from llava.conversation import conv_templates, SeparatorStyle
5 | from functools import partial
6 | from llava.mm_utils import tokenizer_image_token
7 |
8 | def call_llava_engine_df(args, sample, model, tokenizer=None, processor=None, h_block=None, w_block=None, mode=None):
9 |
10 | def deal_with_prompt(input_text, mm_use_im_start_end, ocr_tokens):
11 | if ocr_tokens is not None:
12 | qs = input_text + '\n' + ocr_tokens
13 | else:
14 | qs = input_text
15 | if mm_use_im_start_end:
16 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
17 | else:
18 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
19 | return qs
20 |
21 | prompt = sample['final_input_prompt']
22 | ocr_tokens = sample.get('ocr', None)
23 | prompt = deal_with_prompt(prompt, model.config.mm_use_im_start_end, ocr_tokens)
24 | conv = conv_templates[args.conv_mode].copy()
25 | conv.append_message(conv.roles[0], prompt)
26 | conv.append_message(conv.roles[1], None)
27 | prompt = conv.get_prompt()
28 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
29 | image = sample['image']
30 |
31 | if image is not None:
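   | # Temporarily monkey-patch model.forward via functools.partial so every forward call
   | # issued by generate() also receives the slicing metadata (mode, h_block, w_block);
   | # the original forward is restored right after generation.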
32 | model.orig_forward = model.forward
33 | model.forward = partial(model.orig_forward,
34 | mode=mode,
35 | h_block = [h_block],
36 | w_block = [w_block]
37 | )
38 | output_ids = model.generate(
39 | input_ids,
40 | images=image.bfloat16().cuda(),
41 | do_sample=False,
42 | temperature=0,
43 | num_beams=1,
44 | top_p=None,
45 | max_new_tokens=1024,
46 | use_cache=True)
47 |
48 | model.forward = model.orig_forward
49 |
50 | input_token_len = input_ids.shape[1]
51 |
52 | response = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
53 | # response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip('\n')
54 | else: # multiple images actually
55 | if sample['question_type'] == 'multiple-choice':
56 | all_choices = sample['all_choices']
57 | response = random.choice(all_choices)
58 | else:
59 | response = 'INVALID GENERATION FOR MULTIPLE IMAGE INPUTS'
60 |
61 | return response
62 |
--------------------------------------------------------------------------------
/llava/eval/model_qa.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria
3 | import torch
4 | import os
5 | import json
6 | from tqdm import tqdm
7 | import shortuuid
8 |
9 | from llava.conversation import default_conversation
10 | from llava.utils import disable_torch_init
11 |
12 |
13 | # new stopping implementation
14 | class KeywordsStoppingCriteria(StoppingCriteria):
15 | def __init__(self, keywords, tokenizer, input_ids):
16 | self.keywords = keywords
17 | self.tokenizer = tokenizer
18 | self.start_len = None
19 | self.input_ids = input_ids
20 |
21 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
22 | if self.start_len is None:
23 | self.start_len = self.input_ids.shape[1]
24 | else:
25 | outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
26 | for keyword in self.keywords:
27 | if keyword in outputs:
28 | return True
29 | return False
30 |
31 |
32 | @torch.inference_mode()
33 | def eval_model(model_name, questions_file, answers_file):
34 | # Model
35 | disable_torch_init()
36 | model_name = os.path.expanduser(model_name)
37 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
38 | model = AutoModelForCausalLM.from_pretrained(model_name,
39 | torch_dtype=torch.float16).cuda()
40 |
41 |
42 | ques_file = open(os.path.expanduser(questions_file), "r")
43 | ans_file = open(os.path.expanduser(answers_file), "w")
44 | for i, line in enumerate(tqdm(ques_file)):
45 | idx = json.loads(line)["question_id"]
46 | qs = json.loads(line)["text"]
47 | cat = json.loads(line)["category"]
48 | conv = default_conversation.copy()
49 | conv.append_message(conv.roles[0], qs)
50 | prompt = conv.get_prompt()
51 | inputs = tokenizer([prompt])
52 | input_ids = torch.as_tensor(inputs.input_ids).cuda()
53 | stopping_criteria = KeywordsStoppingCriteria([conv.sep], tokenizer, input_ids)
54 | output_ids = model.generate(
55 | input_ids,
56 | do_sample=True,
57 | use_cache=True,
58 | temperature=0.7,
59 | max_new_tokens=1024,
60 | stopping_criteria=[stopping_criteria])
61 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
62 | try:
63 | index = outputs.index(conv.sep, len(prompt))
64 | except ValueError:
65 | outputs += conv.sep
66 | index = outputs.index(conv.sep, len(prompt))
67 |
68 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
69 | ans_id = shortuuid.uuid()
70 | ans_file.write(json.dumps({"question_id": idx,
71 | "text": outputs,
72 | "answer_id": ans_id,
73 | "model_id": model_name,
74 | "metadata": {}}) + "\n")
75 | ans_file.flush()
76 | ans_file.close()
77 |
78 | if __name__ == "__main__":
79 | parser = argparse.ArgumentParser()
80 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
81 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
82 | parser.add_argument("--answers-file", type=str, default="answer.jsonl")
83 | args = parser.parse_args()
84 |
85 | eval_model(args.model_name, args.question_file, args.answers_file)
86 |
--------------------------------------------------------------------------------
/llava/eval/model_vqa.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 | import os
4 | import json
5 | from tqdm import tqdm
6 | import shortuuid
7 |
8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
9 | from llava.conversation import conv_templates, SeparatorStyle
10 | from llava.utils import disable_torch_init
11 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path, KeywordsStoppingCriteria
12 |
13 | from PIL import Image
14 | import math
15 | import torch.nn.functional as F
16 | from functools import partial
17 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
18 | from llava.model import *
19 | from llava.patch_divide import Image_Patch
20 | from torchvision.transforms import Compose, ToTensor, Normalize
21 |
22 |
23 | def split_list(lst, n):
24 | """Split a list into n (roughly) equal-sized chunks"""
25 | chunk_size = math.ceil(len(lst) / n)  # ceiling division so every element lands in a chunk
26 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
27 |
28 |
29 | def get_chunk(lst, n, k):
30 | chunks = split_list(lst, n)
31 | return chunks[k]
32 |
33 |
34 | def eval_model(args):
35 | # Model
36 | disable_torch_init()
37 | model_path = os.path.expanduser(args.model_path)
38 | model_name = get_model_name_from_path(model_path)
39 | tokenizer = AutoTokenizer.from_pretrained(
40 | model_path,
41 | model_max_length = 2048,
42 | padding_side="right",
43 | use_fast = True
44 | )
45 |
46 | model = LlavaLlamaForCausalLM.from_pretrained(
47 | model_path,
48 | torch_dtype=torch.bfloat16,
49 | ).cuda()
50 |
51 | for m in model.modules():
52 | m.tokenizer = tokenizer
53 |
54 | vision_tower = model.get_vision_tower()
55 | if not vision_tower.is_loaded:
56 | vision_tower.load_model()
57 | vision_tower.to(device='cuda', dtype=torch.float16)
58 | image_processor = vision_tower.image_processor
59 |
60 | patch_num = getattr(model.config, 'patch_num', '9')
61 | image_patch = Image_Patch(patch_num=int(patch_num))
62 | preprocess = Compose([ToTensor(), Normalize((0.48145466, 0.4578275, 0.40821073),(0.26862954, 0.26130258, 0.27577711))])
63 |
64 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
65 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
66 | answers_file = os.path.expanduser(args.answers_file)
67 | os.makedirs(os.path.dirname(answers_file), exist_ok=True)
68 | ans_file = open(answers_file, "w")
69 | for line in tqdm(questions):
70 | idx = line["question_id"]
71 | image_file = line["image"]
72 | qs = line["text"]
73 | cur_prompt = qs
74 | if model.config.mm_use_im_start_end:
75 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
76 | else:
77 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
78 |
79 | conv = conv_templates[args.conv_mode].copy()
80 | conv.append_message(conv.roles[0], qs)
81 | conv.append_message(conv.roles[1], None)
82 | prompt = conv.get_prompt()
83 |
84 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
85 |
86 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
87 | if model.config.image_aspect_ratio == 'slice':
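   | # Same high-resolution slicing as in the MMMU eval script: resize + zero-pad the image onto
   | # an h_block x w_block grid of 336x336 blocks, crop one sub-image per block, and append a
   | # global 336x336 thumbnail when more than one crop is produced.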
88 | image = preprocess(image)
89 | image = image.unsqueeze(0)
90 | h, w = image.shape[-2:]
91 | block_size = 336
92 | h_block, w_block = image_patch.calculate(h, w)
93 | h_ratio = block_size*h_block/h
94 | w_ratio = block_size*w_block/w
95 | if h_ratio<=w_ratio:
96 | w_ = min(block_size*w_block, round(w*h_ratio))
97 | h_ = block_size*h_block
98 | else:
99 | w_ = block_size*w_block
100 | h_ = min(block_size*h_block, round(h*w_ratio))
101 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear')
102 | image = torch.zeros((1, 3, block_size*h_block, block_size*w_block)).to(dtype=image_inter.dtype, device=image_inter.device)
103 | image[:, :, :h_, :w_] = image_inter
104 |
105 | split_images = []
106 | for i_ in range(h_block):
107 | for j_ in range(w_block):
108 | image_s = image[:,:,block_size*i_:block_size*(i_+1), block_size*j_:block_size*(j_+1)]
109 | split_images.append(image_s)
110 | if len(split_images)>1:
111 | h_ratio = block_size/h
112 | w_ratio = block_size/w
113 | if h_ratio<=w_ratio:
114 | w_ = min(block_size, round(w*h_ratio))
115 | h_ = block_size
116 | else:
117 | w_ = block_size
118 | h_ = min(block_size, round(h*w_ratio))
119 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear')
120 | image_s = torch.zeros((1, 3, block_size, block_size)).to(dtype=image_inter.dtype, device=image_inter.device)
121 | image_s[:, :, :h_, :w_] = image_inter
122 | split_images.append(image_s)
123 | image_tensor = torch.cat(split_images, dim=0)
124 | else:
125 | image_tensor = process_images([image], image_processor, model.config)[0]
126 | image_tensor = image_tensor.unsqueeze(0)
127 | h_block = 1
128 | w_block = 1
129 |
130 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
131 | keywords = [stop_str]
132 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
133 | mode = model.config.image_aspect_ratio
134 |
135 | with torch.inference_mode():
136 | model.orig_forward = model.forward
137 | model.forward = partial(model.orig_forward,
138 | mode=mode,
139 | h_block=[h_block],
140 | w_block=[w_block]
141 | )
142 |
143 |
144 | output_ids = model.generate(
145 | input_ids,
146 | images=image_tensor.to(dtype=torch.bfloat16, device='cuda', non_blocking=True),
147 | do_sample=True if args.temperature > 0 else False,
148 | temperature=args.temperature,
149 | top_p=args.top_p,
150 | num_beams=args.num_beams,
151 | # no_repeat_ngram_size=3,
152 | max_new_tokens=1024,
153 | use_cache=True)
154 |
155 | model.forward = model.orig_forward
156 |
157 | input_token_len = input_ids.shape[1]
158 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
159 | if n_diff_input_output > 0:
160 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
161 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
162 | outputs = outputs.strip()
163 | if outputs.endswith(stop_str):
164 | outputs = outputs[:-len(stop_str)]
165 | outputs = outputs.strip()
166 |
167 | ans_id = shortuuid.uuid()
168 | ans_file.write(json.dumps({"question_id": idx,
169 | "prompt": cur_prompt,
170 | "text": outputs,
171 | "answer_id": ans_id,
172 | "model_id": model_name,
173 | "metadata": {}}) + "\n")
174 | ans_file.flush()
175 | ans_file.close()
176 |
177 | if __name__ == "__main__":
178 | parser = argparse.ArgumentParser()
179 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
180 | parser.add_argument("--model-base", type=str, default=None)
181 | parser.add_argument("--image-folder", type=str, default="")
182 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
183 | parser.add_argument("--answers-file", type=str, default="answer.jsonl")
184 | parser.add_argument("--conv-mode", type=str, default="llava_v1")
185 | parser.add_argument("--num-chunks", type=int, default=1)
186 | parser.add_argument("--chunk-idx", type=int, default=0)
187 | parser.add_argument("--temperature", type=float, default=0.2)
188 | parser.add_argument("--top_p", type=float, default=None)
189 | parser.add_argument("--num_beams", type=int, default=1)
190 | args = parser.parse_args()
191 |
192 | eval_model(args)
193 |
--------------------------------------------------------------------------------
/llava/eval/run_llava.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 |
4 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
5 | from llava.conversation import conv_templates, SeparatorStyle
6 | from llava.model.builder import load_pretrained_model
7 | from llava.utils import disable_torch_init
8 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
9 |
10 | from PIL import Image
11 |
12 | import requests
14 | from io import BytesIO
15 |
16 |
17 | def load_image(image_file):
18 | if image_file.startswith('http') or image_file.startswith('https'):
19 | response = requests.get(image_file)
20 | image = Image.open(BytesIO(response.content)).convert('RGB')
21 | else:
22 | image = Image.open(image_file).convert('RGB')
23 | return image
24 |
25 |
26 | def eval_model(args):
27 | # Model
28 | disable_torch_init()
29 |
30 | model_name = get_model_name_from_path(args.model_path)
31 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name)
32 |
33 | qs = args.query
34 | if model.config.mm_use_im_start_end:
35 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
36 | else:
37 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
38 |
39 | if 'llama-2' in model_name.lower():
40 | conv_mode = "llava_llama_2"
41 | elif "v1" in model_name.lower():
42 | conv_mode = "llava_v1"
43 | elif "mpt" in model_name.lower():
44 | conv_mode = "mpt"
45 | else:
46 | conv_mode = "llava_v0"
47 |
48 | if args.conv_mode is not None and conv_mode != args.conv_mode:
49 | print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
50 | else:
51 | args.conv_mode = conv_mode
52 |
53 | conv = conv_templates[args.conv_mode].copy()
54 | conv.append_message(conv.roles[0], qs)
55 | conv.append_message(conv.roles[1], None)
56 | prompt = conv.get_prompt()
57 |
58 | image = load_image(args.image_file)
59 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()
60 |
61 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
62 |
63 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
64 | keywords = [stop_str]
65 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
66 |
67 | with torch.inference_mode():
68 | output_ids = model.generate(
69 | input_ids,
70 | images=image_tensor,
71 | do_sample=True,
72 | temperature=0.2,
73 | max_new_tokens=1024,
74 | use_cache=True,
75 | stopping_criteria=[stopping_criteria])
76 |
77 | input_token_len = input_ids.shape[1]
78 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
79 | if n_diff_input_output > 0:
80 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
81 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
82 | outputs = outputs.strip()
83 | if outputs.endswith(stop_str):
84 | outputs = outputs[:-len(stop_str)]
85 | outputs = outputs.strip()
86 | print(outputs)
87 |
88 | if __name__ == "__main__":
89 | parser = argparse.ArgumentParser()
90 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
91 | parser.add_argument("--model-base", type=str, default=None)
92 | parser.add_argument("--image-file", type=str, required=True)
93 | parser.add_argument("--query", type=str, required=True)
94 | parser.add_argument("--conv-mode", type=str, default=None)
95 | args = parser.parse_args()
96 |
97 | eval_model(args)
98 |
--------------------------------------------------------------------------------
/llava/eval/summarize_gpt_review.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from collections import defaultdict
4 |
5 | import numpy as np
6 |
7 | import argparse
8 |
9 | def parse_args():
10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
11 | parser.add_argument('-d', '--dir', default=None)
12 | parser.add_argument('-v', '--version', default=None)
13 | parser.add_argument('-s', '--select', nargs='*', default=None)
14 | parser.add_argument('-f', '--files', nargs='*', default=[])
15 | parser.add_argument('-i', '--ignore', nargs='*', default=[])
16 | return parser.parse_args()
17 |
18 |
19 | if __name__ == '__main__':
20 | args = parse_args()
21 |
22 | if args.ignore is not None:
23 | args.ignore = [int(x) for x in args.ignore]
24 |
25 | if len(args.files) > 0:
26 | review_files = args.files
27 | else:
28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)]
29 |
30 | for review_file in sorted(review_files):
31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
32 | if args.select is not None and any(x not in config for x in args.select):
33 | continue
34 | if '0613' in config:
35 | version = '0613'
36 | else:
37 | version = '0314'
38 | if args.version is not None and args.version != version:
39 | continue
40 | scores = defaultdict(list)
41 | print(config)
42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
43 | for review_str in f:
44 | review = json.loads(review_str)
45 | if review['question_id'] in args.ignore:
46 | continue
47 | if 'category' in review:
48 | scores[review['category']].append(review['tuple'])
49 | scores['all'].append(review['tuple'])
50 | else:
51 | if 'tuple' in review:
52 | scores['all'].append(review['tuple'])
53 | else:
54 | scores['all'].append(review['score'])
55 | for k, v in sorted(scores.items()):
56 | stats = np.asarray(v).mean(0).tolist()
57 | stats = [round(x, 3) for x in stats]
58 | # print(k, stats, round(stats[1]/stats[0]*100, 1))
59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
60 | print('=================================')
61 |
--------------------------------------------------------------------------------
/llava/mm_utils.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | from io import BytesIO
3 | import base64
4 |
5 | import torch
6 | from transformers import StoppingCriteria
7 | from llava.constants import IMAGE_TOKEN_INDEX
8 |
9 |
10 | def load_image_from_base64(image):
11 | return Image.open(BytesIO(base64.b64decode(image)))
12 |
13 |
14 | def expand2square(pil_img, background_color):
15 | width, height = pil_img.size
16 | if width == height:
17 | return pil_img
18 | elif width > height:
19 | result = Image.new(pil_img.mode, (width, width), background_color)
20 | result.paste(pil_img, (0, (width - height) // 2))
21 | return result
22 | else:
23 | result = Image.new(pil_img.mode, (height, height), background_color)
24 | result.paste(pil_img, ((height - width) // 2, 0))
25 | return result
26 |
27 |
28 | def process_images(images, image_processor, model_cfg):
29 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
30 | new_images = []
31 | if image_aspect_ratio == 'pad':
32 | for image in images:
33 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
34 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
35 | new_images.append(image)
36 | else:
37 | return image_processor(images, return_tensors='pt')['pixel_values']
38 | if all(x.shape == new_images[0].shape for x in new_images):
39 | new_images = torch.stack(new_images, dim=0)
40 | return new_images
41 |
42 |
43 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
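   | # Split the prompt on the '<image>' placeholder, tokenize each text chunk, and interleave
   | # IMAGE_TOKEN_INDEX sentinels between the chunks (keeping a single BOS at the start); the
   | # sentinel positions are later replaced by visual features.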
44 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
45 |
46 | def insert_separator(X, sep):
47 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
48 |
49 | input_ids = []
50 | offset = 0
51 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
52 | offset = 1
53 | input_ids.append(prompt_chunks[0][0])
54 |
55 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
56 | input_ids.extend(x[offset:])
57 |
58 | if return_tensors is not None:
59 | if return_tensors == 'pt':
60 | return torch.tensor(input_ids, dtype=torch.long)
61 | raise ValueError(f'Unsupported tensor type: {return_tensors}')
62 | return input_ids
63 |
64 |
65 | def get_model_name_from_path(model_path):
66 | model_path = model_path.strip("/")
67 | model_paths = model_path.split("/")
68 | if model_paths[-1].startswith('checkpoint-'):
69 | return model_paths[-2] + "_" + model_paths[-1]
70 | else:
71 | return model_paths[-1]
72 |
73 |
74 |
75 |
76 | class KeywordsStoppingCriteria(StoppingCriteria):
77 | def __init__(self, keywords, tokenizer, input_ids):
78 | self.keywords = keywords
79 | self.keyword_ids = []
80 | self.max_keyword_len = 0
81 | for keyword in keywords:
82 | cur_keyword_ids = tokenizer(keyword).input_ids
83 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
84 | cur_keyword_ids = cur_keyword_ids[1:]
85 | if len(cur_keyword_ids) > self.max_keyword_len:
86 | self.max_keyword_len = len(cur_keyword_ids)
87 | self.keyword_ids.append(torch.tensor(cur_keyword_ids))
88 | self.tokenizer = tokenizer
89 | self.start_len = input_ids.shape[1]
90 |
91 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
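   | # Two-stage check: first compare the trailing token ids against each keyword's ids, then
   | # fall back to decoding the last few tokens (up to max_keyword_len) and looking for the
   | # keyword string in the decoded text.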
92 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO
93 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
94 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
95 | for keyword_id in self.keyword_ids:
96 | if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all():
97 | return True
98 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
99 | for keyword in self.keywords:
100 | if keyword in outputs:
101 | return True
102 | return False
--------------------------------------------------------------------------------
/llava/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
2 | from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig
3 |
--------------------------------------------------------------------------------
/llava/model/builder.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Haotian Liu
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import os
17 | import warnings
18 | import shutil
19 |
20 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
21 | import torch
22 | from llava.model import *
23 | from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
24 |
25 |
26 | def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda"):
27 | kwargs = {"device_map": device_map}
28 |
29 | if load_8bit:
30 | kwargs['load_in_8bit'] = True
31 | elif load_4bit:
32 | kwargs['load_in_4bit'] = True
33 | kwargs['quantization_config'] = BitsAndBytesConfig(
34 | load_in_4bit=True,
35 | bnb_4bit_compute_dtype=torch.float16,
36 | bnb_4bit_use_double_quant=True,
37 | bnb_4bit_quant_type='nf4'
38 | )
39 | else:
40 | kwargs['torch_dtype'] = torch.float16
41 |
42 | if 'llava' in model_name.lower():
43 | # Load LLaVA model
44 | if 'lora' in model_name.lower() and model_base is None:
45 | warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
46 | if 'lora' in model_name.lower() and model_base is not None:
47 | lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
48 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
49 | print('Loading LLaVA from base model...')
50 | model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
51 | token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
52 | if model.lm_head.weight.shape[0] != token_num:
53 | model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
54 | model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
55 |
56 | print('Loading additional LLaVA weights...')
57 | if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
58 | non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
59 | else:
60 | # this is probably from HF Hub
61 | from huggingface_hub import hf_hub_download
62 | def load_from_hf(repo_id, filename, subfolder=None):
63 | cache_file = hf_hub_download(
64 | repo_id=repo_id,
65 | filename=filename,
66 | subfolder=subfolder)
67 | return torch.load(cache_file, map_location='cpu')
68 | non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
69 | non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
70 | if any(k.startswith('model.model.') for k in non_lora_trainables):
71 | non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
72 | model.load_state_dict(non_lora_trainables, strict=False)
73 |
74 | from peft import PeftModel
75 | print('Loading LoRA weights...')
76 | model = PeftModel.from_pretrained(model, model_path)
77 | print('Merging LoRA weights...')
78 | model = model.merge_and_unload()
79 | print('Model is loaded...')
80 | elif model_base is not None:
81 | # this may be mm projector only
82 | print('Loading LLaVA from base model...')
83 | if 'mpt' in model_name.lower():
84 | if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')):
85 | shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py'))
86 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
87 | cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
88 | model = LlavaMPTForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
89 | else:
90 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
91 | cfg_pretrained = AutoConfig.from_pretrained(model_path)
92 | model = LlavaLlamaForCausalLM.from_pretrained(
93 | model_base,
94 | # torch_dtype=torch.bfloat16,
95 | ).cuda()
96 | # model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
97 |
98 | mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
99 | mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
100 | model.load_state_dict(mm_projector_weights, strict=False)
101 | else:
102 | if 'mpt' in model_name.lower():
103 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
104 | model = LlavaMPTForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
105 | else:
106 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
107 | model = LlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
108 | else:
109 | # Load language model
110 | if model_base is not None:
111 | # PEFT model
112 | from peft import PeftModel
113 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
114 | model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
115 | print(f"Loading LoRA weights from {model_path}")
116 | model = PeftModel.from_pretrained(model, model_path)
117 | print(f"Merging weights")
118 | model = model.merge_and_unload()
119 | print('Convert to FP16...')
120 | model.to(torch.float16)
121 | else:
122 | use_fast = False
123 | if 'mpt' in model_name.lower():
124 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
125 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
126 | else:
127 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
128 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
129 |
130 | image_processor = None
131 |
132 | if 'llava' in model_name.lower():
133 | mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
134 | mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
135 | if mm_use_im_patch_token:
136 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
137 | if mm_use_im_start_end:
138 | tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
139 | model.resize_token_embeddings(len(tokenizer))
140 |
141 | vision_tower = model.get_vision_tower()
142 | if not vision_tower.is_loaded:
143 | vision_tower.load_model()
144 | vision_tower.to(device=device, dtype=torch.float16)
145 | image_processor = vision_tower.image_processor
146 |
147 | if hasattr(model.config, "max_sequence_length"):
148 | context_len = model.config.max_sequence_length
149 | else:
150 | context_len = 2048
151 |
152 | return tokenizer, model, image_processor, context_len
153 |
--------------------------------------------------------------------------------
/llava/model/consolidate.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
4 | """
5 | import argparse
6 |
7 | import torch
8 | from transformers import AutoTokenizer, AutoModelForCausalLM
9 | from llava.model import *
10 | from llava.model.utils import auto_upgrade
11 |
12 |
13 | def consolidate_ckpt(src_path, dst_path):
14 | print("Loading model")
15 | auto_upgrade(src_path)
16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
18 | src_model.save_pretrained(dst_path)
19 | src_tokenizer.save_pretrained(dst_path)
20 |
21 |
22 | if __name__ == "__main__":
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--src", type=str, required=True)
25 | parser.add_argument("--dst", type=str, required=True)
26 |
27 | args = parser.parse_args()
28 |
29 | consolidate_ckpt(args.src, args.dst)
30 |
--------------------------------------------------------------------------------
/llava/model/language_model/llava_llama.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Haotian Liu
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from typing import List, Optional, Tuple, Union
17 |
18 | import torch
19 | import torch.nn as nn
20 | from torch.nn import CrossEntropyLoss
21 |
22 | from transformers import AutoConfig, AutoModelForCausalLM, \
23 | LlamaConfig, LlamaModel, LlamaForCausalLM
24 |
25 | from transformers.modeling_outputs import CausalLMOutputWithPast
26 |
27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
28 |
29 |
30 | class LlavaConfig(LlamaConfig):
31 | model_type = "llava"
32 |
33 |
34 | class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
35 | config_class = LlavaConfig
36 |
37 | def __init__(self, config: LlamaConfig):
38 | super(LlavaLlamaModel, self).__init__(config)
39 |
40 |
41 | class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
42 | config_class = LlavaConfig
43 |
44 | def __init__(self, config):
45 | super(LlamaForCausalLM, self).__init__(config)
46 | self.model = LlavaLlamaModel(config)
47 |
48 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
49 |
50 | # Initialize weights and apply final processing
51 | self.post_init()
52 |
53 | def get_model(self):
54 | return self.model
55 |
56 | def forward(
57 | self,
58 | input_ids: torch.LongTensor = None,
59 | attention_mask: Optional[torch.Tensor] = None,
60 | past_key_values: Optional[List[torch.FloatTensor]] = None,
61 | inputs_embeds: Optional[torch.FloatTensor] = None,
62 | labels: Optional[torch.LongTensor] = None,
63 | use_cache: Optional[bool] = None,
64 | output_attentions: Optional[bool] = None,
65 | output_hidden_states: Optional[bool] = None,
66 | images: Optional[torch.FloatTensor] = None,
67 | return_dict: Optional[bool] = None,
68 | mode = None,
69 | h_block = None,
70 | w_block = None
71 | ) -> Union[Tuple, CausalLMOutputWithPast]:
72 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
73 | output_hidden_states = (
74 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
75 | )
76 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict
77 |
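   | # Multimodal preprocessing: encode `images` and splice the visual features into the
   | # positions of the IMAGE_TOKEN_INDEX placeholders; mode / h_block / w_block describe the
   | # high-resolution slicing layout of the image patches.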
78 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images, mode, h_block, w_block)
79 |
80 | # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
81 | outputs = self.model(
82 | input_ids=input_ids,
83 | attention_mask=attention_mask,
84 | past_key_values=past_key_values,
85 | inputs_embeds=inputs_embeds,
86 | use_cache=use_cache,
87 | output_attentions=output_attentions,
88 | output_hidden_states=output_hidden_states,
89 | return_dict=return_dict
90 | )
91 |
92 | hidden_states = outputs[0]
93 | logits = self.lm_head(hidden_states)
94 |
95 | loss = None
96 | if labels is not None:
97 | # Shift so that tokens < n predict n
98 | shift_logits = logits[..., :-1, :].contiguous()
99 | shift_labels = labels[..., 1:].contiguous()
100 | # Flatten the tokens
101 | loss_fct = CrossEntropyLoss()
102 | shift_logits = shift_logits.view(-1, self.config.vocab_size)
103 | shift_labels = shift_labels.view(-1)
104 | # Enable model/pipeline parallelism
105 | shift_labels = shift_labels.to(shift_logits.device)
106 | loss = loss_fct(shift_logits, shift_labels)
107 |
108 | if not return_dict:
109 | output = (logits,) + outputs[1:]
110 | return (loss,) + output if loss is not None else output
111 |
112 | return CausalLMOutputWithPast(
113 | loss=loss,
114 | logits=logits,
115 | past_key_values=outputs.past_key_values,
116 | hidden_states=outputs.hidden_states,
117 | attentions=outputs.attentions,
118 | )
119 |
120 | def prepare_inputs_for_generation(
121 | self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
122 | ):
123 | if past_key_values:
124 | input_ids = input_ids[:, -1:]
125 |
126 | # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
127 | if inputs_embeds is not None and past_key_values is None:
128 | model_inputs = {"inputs_embeds": inputs_embeds}
129 | else:
130 | model_inputs = {"input_ids": input_ids}
131 |
132 | model_inputs.update(
133 | {
134 | "past_key_values": past_key_values,
135 | "use_cache": kwargs.get("use_cache"),
136 | "attention_mask": attention_mask,
137 | "images": kwargs.get("images", None),
138 | }
139 | )
140 | return model_inputs
141 |
142 | AutoConfig.register("llava", LlavaConfig)
143 | AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
144 |
--------------------------------------------------------------------------------
/llava/model/language_model/llava_mpt.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Haotian Liu
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from typing import List, Optional, Tuple
17 | import warnings
18 |
19 | import torch
20 | import torch.nn.functional as F
21 | import math
22 |
23 | from transformers import AutoConfig, AutoModelForCausalLM
24 | from transformers.modeling_outputs import CausalLMOutputWithPast
25 |
26 | from .mpt.modeling_mpt import MPTConfig, MPTForCausalLM, MPTModel
27 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
28 |
29 |
30 | class LlavaMPTConfig(MPTConfig):
31 | model_type = "llava_mpt"
32 |
33 |
34 | class LlavaMPTModel(LlavaMetaModel, MPTModel):
35 | config_class = LlavaMPTConfig
36 |
37 | def __init__(self, config: MPTConfig):
38 | config.hidden_size = config.d_model
39 | super(LlavaMPTModel, self).__init__(config)
40 |
41 | def embed_tokens(self, x):
42 | return self.wte(x)
43 |
44 |
45 | class LlavaMPTForCausalLM(MPTForCausalLM, LlavaMetaForCausalLM):
46 | config_class = LlavaMPTConfig
47 | supports_gradient_checkpointing = True
48 |
49 | def __init__(self, config):
50 | super(MPTForCausalLM, self).__init__(config)
51 |
52 | if not config.tie_word_embeddings:
53 | raise ValueError('MPTForCausalLM only supports tied word embeddings')
54 | self.transformer = LlavaMPTModel(config)
55 | self.logit_scale = None
56 | if config.logit_scale is not None:
57 | logit_scale = config.logit_scale
58 | if isinstance(logit_scale, str):
59 | if logit_scale == 'inv_sqrt_d_model':
60 | logit_scale = 1 / math.sqrt(config.d_model)
61 | else:
62 | raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
63 | self.logit_scale = logit_scale
64 |
65 | def get_model(self):
66 | return self.transformer
67 |
68 | def _set_gradient_checkpointing(self, module, value=False):
69 | if isinstance(module, LlavaMPTModel):
70 | module.gradient_checkpointing = value
71 |
72 | def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None):
73 | return_dict = return_dict if return_dict is not None else self.config.return_dict
74 | use_cache = use_cache if use_cache is not None else self.config.use_cache
75 |
76 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images)
77 | outputs = self.transformer(input_ids=input_ids, inputs_embeds=inputs_embeds, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
78 | # FIXME: this is a hack to fix the multiple gpu inference issue in https://github.com/haotian-liu/LLaVA/issues/338
79 | logits = F.linear(outputs.last_hidden_state.to(self.transformer.wte.weight.device), self.transformer.wte.weight)
80 | if self.logit_scale is not None:
81 | if self.logit_scale == 0:
82 | warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
83 | logits *= self.logit_scale
84 | loss = None
85 | if labels is not None:
86 | labels = torch.roll(labels, shifts=-1)
87 | labels[:, -1] = -100
88 | loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
89 | return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)
90 |
91 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
92 | if inputs_embeds is not None:
93 | raise NotImplementedError('inputs_embeds is not implemented for MPT yet')
94 | attention_mask = kwargs['attention_mask'].bool()
95 | if attention_mask[:, -1].sum() != attention_mask.shape[0]:
96 | raise NotImplementedError('MPT does not support generation with right padding.')
97 | if self.transformer.attn_uses_sequence_id and self.training:
98 | sequence_id = torch.zeros_like(input_ids[:1])
99 | else:
100 | sequence_id = None
101 | if past_key_values is not None:
102 | input_ids = input_ids[:, -1].unsqueeze(-1)
103 | if self.transformer.prefix_lm:
104 | prefix_mask = torch.ones_like(attention_mask)
105 | if kwargs.get('use_cache') == False:
106 | raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
107 | else:
108 | prefix_mask = None
109 | return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True), "images": kwargs.get("images", None)}
110 |
111 |
112 | AutoConfig.register("llava_mpt", LlavaMPTConfig)
113 | AutoModelForCausalLM.register(LlavaMPTConfig, LlavaMPTForCausalLM)
114 |
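The two `register` calls above are what make the custom `llava_mpt` model type resolvable through the Hugging Face Auto classes. A minimal sketch of what this enables, assuming the transformers version pinned in `pyproject.toml` (the config values are MPT defaults, not a released TokenPacker checkpoint):

```python
from transformers import AutoConfig

# Importing the module executes the AutoConfig / AutoModelForCausalLM register() calls above.
from llava.model.language_model.llava_mpt import LlavaMPTConfig, LlavaMPTForCausalLM

cfg = AutoConfig.for_model("llava_mpt")    # resolved to LlavaMPTConfig via the registration
print(type(cfg).__name__, cfg.model_type)  # LlavaMPTConfig llava_mpt
```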
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/attention.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/attention.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/blocks.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/blocks.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/flash_attn_triton.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/flash_attn_triton.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/meta_init_context.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/meta_init_context.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/norm.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/norm.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/__pycache__/param_init_fns.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/param_init_fns.cpython-310.pyc
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/adapt_tokenizer.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
3 | Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
4 | NUM_SENTINEL_TOKENS: int = 100
5 |
6 | def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
7 | """Adds sentinel tokens and padding token (if missing).
8 |
9 | Expands the tokenizer vocabulary to include sentinel tokens
10 | used in mixture-of-denoiser tasks as well as a padding token.
11 |
12 | All added tokens are added as special tokens. No tokens are
13 | added if sentinel tokens and padding token already exist.
14 | """
15 | sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]
16 | tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
17 | if tokenizer.pad_token is None:
18 | tokenizer.add_tokens('<pad>', special_tokens=True)
19 | tokenizer.pad_token = '<pad>'
20 | assert tokenizer.pad_token_id is not None
21 | sentinels = ''.join([f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)])
22 | _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
23 | tokenizer.sentinel_token_ids = _sentinel_token_ids
24 |
25 | class AutoTokenizerForMOD(AutoTokenizer):
26 | """AutoTokenizer + Adaptation for MOD.
27 |
28 | A simple wrapper around AutoTokenizer to make instantiating
29 | an MOD-adapted tokenizer a bit easier.
30 |
31 | MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),
32 | a padding token, and a property to get the token ids of the
33 | sentinel tokens.
34 | """
35 |
36 | @classmethod
37 | def from_pretrained(cls, *args, **kwargs):
38 | """See `AutoTokenizer.from_pretrained` docstring."""
39 | tokenizer = super().from_pretrained(*args, **kwargs)
40 | adapt_tokenizer_for_denoising(tokenizer)
41 | return tokenizer
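A short usage sketch of the adaptation above; the GPT-NeoX tokenizer name is only an illustrative base tokenizer, not something this repo requires:

```python
from llava.model.language_model.mpt.adapt_tokenizer import AutoTokenizerForMOD

# from_pretrained wraps AutoTokenizer and then applies adapt_tokenizer_for_denoising.
tokenizer = AutoTokenizerForMOD.from_pretrained("EleutherAI/gpt-neox-20b")
print(len(tokenizer.sentinel_token_ids))   # 100 ids, one per <extra_id_i> sentinel token
print(tokenizer.pad_token)                 # '<pad>' is added when the base tokenizer has none
```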
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/blocks.py:
--------------------------------------------------------------------------------
1 | """GPT Blocks used for the GPT Model."""
2 | from typing import Dict, Optional, Tuple
3 | import torch
4 | import torch.nn as nn
5 | from .attention import ATTN_CLASS_REGISTRY
6 | from .norm import NORM_CLASS_REGISTRY
7 |
8 | class MPTMLP(nn.Module):
9 |
10 | def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
11 | super().__init__()
12 | self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device)
13 | self.act = nn.GELU(approximate='none')
14 | self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device)
15 | self.down_proj._is_residual = True
16 |
17 | def forward(self, x):
18 | return self.down_proj(self.act(self.up_proj(x)))
19 |
20 | class MPTBlock(nn.Module):
21 |
22 | def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, device: Optional[str]=None, **kwargs):
23 | del kwargs
24 | super().__init__()
25 | norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
26 | attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
27 | self.norm_1 = norm_class(d_model, device=device)
28 | self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], d_model=d_model, n_heads=n_heads, verbose=verbose, device=device)
29 | self.norm_2 = norm_class(d_model, device=device)
30 | self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
31 | self.resid_attn_dropout = nn.Dropout(resid_pdrop)
32 | self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
33 |
34 | def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
35 | a = self.norm_1(x)
36 | (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
37 | x = x + self.resid_attn_dropout(b)
38 | m = self.norm_2(x)
39 | n = self.ffn(m)
40 | x = x + self.resid_ffn_dropout(n)
41 | return (x, attn_weights, past_key_value)
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/configuration_mpt.py:
--------------------------------------------------------------------------------
1 | """A HuggingFace-style model configuration."""
2 | from typing import Dict, Optional, Union
3 | from transformers import PretrainedConfig
4 | attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
5 | init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
6 |
7 | class MPTConfig(PretrainedConfig):
8 | model_type = 'mpt'
9 |
10 | def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
11 | """The MPT configuration class.
12 |
13 | Args:
14 | d_model (int): The size of the embedding dimension of the model.
15 | n_heads (int): The number of attention heads.
16 | n_layers (int): The number of layers in the model.
17 | expansion_ratio (int): The ratio of the up/down scale in the MLP.
18 | max_seq_len (int): The maximum sequence length of the model.
19 | vocab_size (int): The size of the vocabulary.
20 | resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
21 | emb_pdrop (float): The dropout probability for the embedding layer.
22 | learned_pos_emb (bool): Whether to use learned positional embeddings
23 | attn_config (Dict): A dictionary used to configure the model's attention module:
24 | attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
25 | attn_pdrop (float): The dropout probability for the attention layers.
26 | attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
27 | qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
28 | clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
29 | this value.
30 | softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
31 | use the default scale of ``1/sqrt(d_keys)``.
32 | prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
33 | extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
34 | can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
35 | attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
36 | When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
37 | which sub-sequence each token belongs to.
38 | Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
39 | alibi (bool): Whether to use the alibi bias instead of position embeddings.
40 | alibi_bias_max (int): The maximum value of the alibi bias.
41 | init_device (str): The device to use for parameter initialization.
42 | logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
43 | no_bias (bool): Whether to use bias in all layers.
44 | verbose (int): The verbosity level. 0 is silent.
45 | embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
46 | norm_type (str): choose type of norm to use
47 | multiquery_attention (bool): Whether to use multiquery attention implementation.
48 | use_cache (bool): Whether or not the model should return the last key/values attentions
49 | init_config (Dict): A dictionary used to configure the model initialization:
50 | init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
51 | 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
52 | 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
53 | init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
54 | emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
55 | emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
56 | used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
57 | init_std (float): The standard deviation of the normal distribution used to initialize the model,
58 | if using the baseline_ parameter initialization scheme.
59 | init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
60 | fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
61 | init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
62 | ---
63 | See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
64 | """
65 | self.d_model = d_model
66 | self.n_heads = n_heads
67 | self.n_layers = n_layers
68 | self.expansion_ratio = expansion_ratio
69 | self.max_seq_len = max_seq_len
70 | self.vocab_size = vocab_size
71 | self.resid_pdrop = resid_pdrop
72 | self.emb_pdrop = emb_pdrop
73 | self.learned_pos_emb = learned_pos_emb
74 | self.attn_config = attn_config
75 | self.init_device = init_device
76 | self.logit_scale = logit_scale
77 | self.no_bias = no_bias
78 | self.verbose = verbose
79 | self.embedding_fraction = embedding_fraction
80 | self.norm_type = norm_type
81 | self.use_cache = use_cache
82 | self.init_config = init_config
83 | if 'name' in kwargs:
84 | del kwargs['name']
85 | if 'loss_fn' in kwargs:
86 | del kwargs['loss_fn']
87 | super().__init__(**kwargs)
88 | self._validate_config()
89 |
90 | def _set_config_defaults(self, config, config_defaults):
91 | for (k, v) in config_defaults.items():
92 | if k not in config:
93 | config[k] = v
94 | return config
95 |
96 | def _validate_config(self):
97 | self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
98 | self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
99 | if self.d_model % self.n_heads != 0:
100 | raise ValueError('d_model must be divisible by n_heads')
101 | if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
102 | raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
103 | if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
104 | raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
105 | if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
106 | raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
107 | if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
108 | raise NotImplementedError('alibi only implemented with torch and triton attention.')
109 | if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
110 | raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
111 | if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
112 | raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
113 | if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
114 | raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
115 | if self.init_config.get('name', None) is None:
116 | raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
117 | if not self.learned_pos_emb and (not self.attn_config['alibi']):
118 | raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.')
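Given the long argument list documented above, here is a small sketch of constructing and validating a config; the field values are illustrative defaults, not TokenPacker's training configuration:

```python
from llava.model.language_model.mpt.configuration_mpt import MPTConfig

cfg = MPTConfig(
    d_model=2048,
    n_heads=16,                          # d_model must be divisible by n_heads
    n_layers=24,
    max_seq_len=2048,
    logit_scale="inv_sqrt_d_model",      # the only accepted string; otherwise pass a float or None
)
print(cfg.attn_config["attn_impl"])      # missing attn_config keys are filled from attn_config_defaults
```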
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/custom_embedding.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch import Tensor
5 |
6 | class SharedEmbedding(nn.Embedding):
7 |
8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor:
9 | if unembed:
10 | return F.linear(input, self.weight)
11 | return super().forward(input)
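`SharedEmbedding` lets one weight matrix serve as both the token embedding and the output projection (MPT requires tied word embeddings, and `llava_mpt.py` above computes logits with `F.linear` against `transformer.wte.weight`). A quick shape sketch:

```python
import torch
from llava.model.language_model.mpt.custom_embedding import SharedEmbedding

emb = SharedEmbedding(num_embeddings=100, embedding_dim=32)
tokens = torch.randint(0, 100, (2, 8))
hidden = emb(tokens)                 # ordinary lookup: (2, 8, 32)
logits = emb(hidden, unembed=True)   # reuse the same weight as the LM head: (2, 8, 100)
print(hidden.shape, logits.shape)
```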
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/meta_init_context.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | import torch
3 | import torch.nn as nn
4 |
5 | @contextmanager
6 | def init_empty_weights(include_buffers: bool=False):
7 | """Meta initialization context manager.
8 |
9 | A context manager under which models are initialized with all parameters
10 | on the meta device, therefore creating an empty model. Useful when just
11 | initializing the model would blow the available RAM.
12 |
13 | Args:
14 | include_buffers (`bool`, *optional*, defaults to `False`): Whether or
15 | not to also put all buffers on the meta device while initializing.
16 |
17 | Example:
18 | ```python
19 | import torch.nn as nn
20 |
21 | # Initialize a model with 100 billion parameters in no time and without using any RAM.
22 | with init_empty_weights():
23 | tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
24 | ```
25 |
26 |
27 |
28 | Any model created under this context manager has no weights. As such you can't do something like
29 | `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
30 |
31 |
32 | """
33 | with init_on_device(torch.device('meta'), include_buffers=include_buffers) as f:
34 | yield f
35 |
36 | @contextmanager
37 | def init_on_device(device: torch.device, include_buffers: bool=False):
38 | """Device initialization context manager.
39 |
40 | A context manager under which models are initialized with all parameters
41 | on the specified device.
42 |
43 | Args:
44 | device (`torch.device`): Device to initialize all parameters on.
45 | include_buffers (`bool`, *optional*, defaults to `False`): Whether or
46 | not to also put all buffers on the meta device while initializing.
47 |
48 | Example:
49 | ```python
50 | import torch.nn as nn
51 |
52 | with init_on_device(device=torch.device("cuda")):
53 | tst = nn.Linear(100, 100) # on `cuda` device
54 | ```
55 | """
56 | old_register_parameter = nn.Module.register_parameter
57 | if include_buffers:
58 | old_register_buffer = nn.Module.register_buffer
59 |
60 | def register_empty_parameter(module, name, param):
61 | old_register_parameter(module, name, param)
62 | if param is not None:
63 | param_cls = type(module._parameters[name])
64 | kwargs = module._parameters[name].__dict__
65 | module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
66 |
67 | def register_empty_buffer(module, name, buffer):
68 | old_register_buffer(module, name, buffer)
69 | if buffer is not None:
70 | module._buffers[name] = module._buffers[name].to(device)
71 | if include_buffers:
72 | tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ['empty', 'zeros', 'ones', 'full']}
73 | else:
74 | tensor_constructors_to_patch = {}
75 |
76 | def patch_tensor_constructor(fn):
77 |
78 | def wrapper(*args, **kwargs):
79 | kwargs['device'] = device
80 | return fn(*args, **kwargs)
81 | return wrapper
82 | try:
83 | nn.Module.register_parameter = register_empty_parameter
84 | if include_buffers:
85 | nn.Module.register_buffer = register_empty_buffer
86 | for torch_function_name in tensor_constructors_to_patch.keys():
87 | setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
88 | yield
89 | finally:
90 | nn.Module.register_parameter = old_register_parameter
91 | if include_buffers:
92 | nn.Module.register_buffer = old_register_buffer
93 | for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items():
94 | setattr(torch, torch_function_name, old_torch_function)
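A quick CPU-only check of the context managers above (no GPU or checkpoint needed):

```python
import torch.nn as nn
from llava.model.language_model.mpt.meta_init_context import init_empty_weights

with init_empty_weights():
    layer = nn.Linear(4096, 4096)    # parameters are created on the meta device, so no memory is allocated
print(layer.weight.device)           # device(type='meta')
```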
--------------------------------------------------------------------------------
/llava/model/language_model/mpt/norm.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | def _cast_if_autocast_enabled(tensor):
4 | if torch.is_autocast_enabled():
5 | if tensor.device.type == 'cuda':
6 | dtype = torch.get_autocast_gpu_dtype()
7 | elif tensor.device.type == 'cpu':
8 | dtype = torch.get_autocast_cpu_dtype()
9 | else:
10 | raise NotImplementedError()
11 | return tensor.to(dtype=dtype)
12 | return tensor
13 |
14 | class LPLayerNorm(torch.nn.LayerNorm):
15 |
16 | def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
17 | super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
18 |
19 | def forward(self, x):
20 | module_device = x.device
21 | downcast_x = _cast_if_autocast_enabled(x)
22 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
23 | downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
24 | with torch.autocast(enabled=False, device_type=module_device.type):
25 | return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)
26 |
27 | def rms_norm(x, weight=None, eps=1e-05):
28 | output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
29 | if weight is not None:
30 | return output * weight
31 | return output
32 |
33 | class RMSNorm(torch.nn.Module):
34 |
35 | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
36 | super().__init__()
37 | self.eps = eps
38 | if weight:
39 | self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device))
40 | else:
41 | self.register_parameter('weight', None)
42 |
43 | def forward(self, x):
44 | return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype)
45 |
46 | class LPRMSNorm(RMSNorm):
47 |
48 | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
49 | super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device)
50 |
51 | def forward(self, x):
52 | downcast_x = _cast_if_autocast_enabled(x)
53 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
54 | with torch.autocast(enabled=False, device_type=x.device.type):
55 | return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
56 | NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
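The registry at the bottom is what `MPTBlock` (via the `norm_type` string in `configuration_mpt.py`) indexes into; a small sketch:

```python
import torch
from llava.model.language_model.mpt.norm import NORM_CLASS_REGISTRY

norm_cls = NORM_CLASS_REGISTRY["low_precision_layernorm"]   # the MPTConfig default norm_type
norm = norm_cls(1024)
out = norm(torch.randn(2, 16, 1024))
print(type(norm).__name__, out.shape)                       # LPLayerNorm torch.Size([2, 16, 1024])
```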
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/builder.py:
--------------------------------------------------------------------------------
1 | import os
2 | from .clip_encoder import CLIPVisionTower
3 |
4 |
5 | def build_vision_tower(vision_tower_cfg, **kwargs):
6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
7 | is_absolute_path_exists = os.path.exists(vision_tower)
8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"):
9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
10 |
11 | raise ValueError(f'Unknown vision tower: {vision_tower}')
12 |
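A hedged sketch of how `build_vision_tower` is typically driven; the `SimpleNamespace` stands in for the model/training config, and `openai/clip-vit-large-patch14-336` is the usual LLaVA-1.5 vision tower rather than something this file mandates:

```python
from types import SimpleNamespace
from llava.model.multimodal_encoder.builder import build_vision_tower

cfg = SimpleNamespace(
    mm_vision_tower="openai/clip-vit-large-patch14-336",
    mm_vision_select_layer=-2,
    mm_vision_select_feature="patch",
)
tower = build_vision_tower(cfg, delay_load=True)   # delay_load fetches only the CLIP config, not the weights
print(tower.vision_tower_name, tower.hidden_size)  # openai/clip-vit-large-patch14-336 1024
```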
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/clip_encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
5 |
6 |
7 | class CLIPVisionTower(nn.Module):
8 | def __init__(self, vision_tower, args, delay_load=False):
9 | super().__init__()
10 | self.is_loaded = False
11 |
12 | self.vision_tower_name = vision_tower
13 | self.select_layer = args.mm_vision_select_layer
14 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
15 |
16 | if not delay_load:
17 | self.load_model()
18 | else:
19 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
20 |
21 | def load_model(self):
22 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
23 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
24 | self.vision_tower.requires_grad_(False)
25 |
26 | self.is_loaded = True
27 |
28 | def feature_select(self, image_forward_outs, layers=[12,16,22,23]):
29 | image_feature_list = []
30 | for l in layers:
31 | image_feature_list.append(image_forward_outs.hidden_states[l])
32 | image_features_multi = torch.cat(image_feature_list, dim=2)
33 |
34 | image_features = image_forward_outs.hidden_states[self.select_layer]
35 |
36 | if self.select_feature == 'patch':
37 | image_features = image_features[:, 1:]
38 | image_features_multi = image_features_multi[:, 1:]
39 |
40 | elif self.select_feature == 'cls_patch':
41 | image_features = image_features
42 | else:
43 | raise ValueError(f'Unexpected select feature: {self.select_feature}')
44 | return image_features, image_features_multi
45 |
46 | @torch.no_grad()
47 | def forward(self, images):
48 |
49 | if type(images) is list:
50 | image_features, image_features_multi = [], []
51 | for image in images:
52 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
53 | image_feature, image_feature_multi = self.feature_select(image_forward_out)
54 |
55 | image_features.append(image_feature.to(image.dtype))
56 | image_features_multi.append(image_feature_multi.to(image.dtype))
57 | return image_features, image_features_multi  # list inputs: features were already cast per image above
58 | else:
59 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
60 | image_features, image_features_multi = self.feature_select(image_forward_outs)
61 |
62 | return (image_features.to(images.dtype), image_features_multi.to(images.dtype))
63 |
64 | @property
65 | def dummy_feature(self):
66 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
67 |
68 | @property
69 | def dtype(self):
70 | return self.vision_tower.dtype
71 |
72 | @property
73 | def device(self):
74 | return self.vision_tower.device
75 |
76 | @property
77 | def config(self):
78 | if self.is_loaded:
79 | return self.vision_tower.config
80 | else:
81 | return self.cfg_only
82 |
83 | @property
84 | def hidden_size(self):
85 | return self.config.hidden_size
86 |
87 | @property
88 | def num_patches(self):
89 | return (self.config.image_size // self.config.patch_size) ** 2
90 |
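`feature_select` returns two tensors: the usual single-layer patch features (`select_layer`, which is -2 in typical LLaVA-style configs) and a channel-wise concatenation of hidden layers 12, 16, 22 and 23, which is what the TokenPacker projector below consumes as keys/values. A shape sketch with mocked hidden states (ViT-L/14-336 yields 24x24 = 576 patches plus a CLS token, and 25 hidden states):

```python
import torch

# 25 hidden states (embeddings + 24 transformer layers), each (B, 1 + 576, 1024)
hidden_states = [torch.randn(1, 577, 1024) for _ in range(25)]

# multi-level path: layers 12/16/22/23 concatenated along channels, CLS dropped -> (1, 576, 4096)
image_features_multi = torch.cat([hidden_states[l] for l in (12, 16, 22, 23)], dim=2)[:, 1:]
# single-level path: the selected layer (-2, i.e. penultimate) with CLS dropped -> (1, 576, 1024)
image_features = hidden_states[-2][:, 1:]
print(image_features.shape, image_features_multi.shape)
```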
--------------------------------------------------------------------------------
/llava/model/multimodal_projector/builder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import re
4 | from functools import partial
5 | import numpy as np
6 | from torch.nn.init import trunc_normal_
7 | from torch.nn import functional as F
8 | import math
9 |
10 |
11 | class IdentityMap(nn.Module):
12 | def __init__(self):
13 | super().__init__()
14 |
15 | def forward(self, x, *args, **kwargs):
16 | return x
17 |
18 | @property
19 | def config(self):
20 | return {"mm_projector_type": 'identity'}
21 |
22 |
23 | class SimpleResBlock(nn.Module):
24 | def __init__(self, channels):
25 | super().__init__()
26 | self.pre_norm = nn.LayerNorm(channels)
27 |
28 | self.proj = nn.Sequential(
29 | nn.Linear(channels, channels),
30 | nn.GELU(),
31 | nn.Linear(channels, channels)
32 | )
33 | def forward(self, x):
34 | x = self.pre_norm(x)
35 | return x + self.proj(x)
36 |
37 |
38 |
39 | class TokenPacker(nn.Module):
40 | def __init__(
41 | self,
42 | raw_grid=24,
43 | embed_dim=1024,
44 | num_heads=1024//128,
45 | kv_dim=1024,
46 | hidden_size=4096,
47 | scale_factor=2,
48 | norm_layer=partial(nn.LayerNorm, eps=1e-6)
49 | ):
50 | super().__init__()
51 | if raw_grid%scale_factor!=0:
52 | raise ValueError("raw_grid must be divisible by scale_factor")
53 | self.raw_grid = raw_grid
54 | self.grid_size = raw_grid//scale_factor
55 | self.num_queries = self.grid_size ** 2
56 | self.embed_dim = embed_dim
57 | self.num_heads = num_heads
58 | self.scale_factor = scale_factor
59 | self.q_proj_1 = nn.Linear(kv_dim, embed_dim, bias=False)
60 |
61 | k_modules = [nn.Linear(4096, 1024)]
62 | for _ in range(1,2):
63 | k_modules.append(nn.GELU())
64 | k_modules.append(nn.Linear(1024, 1024))
65 | self.k_proj_1 = nn.Sequential(*k_modules)
66 |
67 | v_modules = [nn.Linear(4096, 1024)]
68 | for _ in range(1,2):
69 | v_modules.append(nn.GELU())
70 | v_modules.append(nn.Linear(1024, 1024))
71 | self.v_proj_1 = nn.Sequential(*v_modules)
72 |
73 | self.ln_q_1 = norm_layer(embed_dim)
74 | self.ln_k_1 = norm_layer(embed_dim)
75 | self.ln_v_1 = norm_layer(embed_dim)
76 |
77 | self.clip_attn = nn.MultiheadAttention(embed_dim, num_heads)
78 |
79 | modules = [nn.Linear(1024, hidden_size)]
80 | for _ in range(1, 2):
81 | modules.append(nn.GELU())
82 | modules.append(nn.Linear(hidden_size, hidden_size))
83 | self.mlp = nn.Sequential(*modules)
84 |
85 | self.apply(self._init_weights)
86 |
87 | def _init_weights(self, m):
88 | if isinstance(m, nn.Linear):
89 | trunc_normal_(m.weight, std=.02)
90 | if isinstance(m, nn.Linear) and m.bias is not None:
91 | nn.init.constant_(m.bias, 0)
92 | elif isinstance(m, nn.LayerNorm):
93 | nn.init.constant_(m.bias, 0)
94 | nn.init.constant_(m.weight, 1.0)
95 |
96 | def divide_feature(self, x, kernel_size, token_num, N, c):
97 | h = w = int(token_num**0.5)
98 |
99 | reshape_x = x.reshape(h, w, N, c).reshape(h//kernel_size, kernel_size, w, N, c)
100 | reshape_x = reshape_x.permute(0,2,1,3,4)
101 | reshape_x = reshape_x.reshape(h//kernel_size, w//kernel_size, kernel_size, kernel_size, N, c)
102 | reshape_x = reshape_x.permute(0,1,3,2,4,5).reshape(h//kernel_size, w//kernel_size, kernel_size*kernel_size, N, c)
103 | reshape_x = reshape_x.permute(2,0,1,3,4).reshape(kernel_size*kernel_size, -1, c)
104 |
105 | return reshape_x
106 |
107 | def forward(self, x, attn_mask=None):
108 |
109 | x_multi = x[1] # multi-level features (used as keys/values)
110 | x = x[0] # original single-level
111 |
112 | key = self.ln_k_1(self.k_proj_1(x_multi)).permute(1, 0, 2)
113 | value = self.ln_v_1(self.v_proj_1(x_multi)).permute(1, 0, 2)
114 |
115 | token_num, N, c = key.shape
116 |
117 | q = F.interpolate(x.reshape(x.shape[0],self.raw_grid,self.raw_grid,-1).float().permute(0,3,1,2), size=(self.grid_size, self.grid_size), mode='bilinear').permute(0,2,3,1) # downsample the raw_grid x raw_grid grid to grid_size x grid_size to form the low-resolution queries
118 | q = q.reshape(q.shape[0], -1, q.shape[-1]).to(x.dtype)
119 |
120 | query = self.ln_q_1(self.q_proj_1(q)).permute(1, 0, 2)
121 |
122 | reshape_query = self.divide_feature(query, 1, self.num_queries, N, c)
123 | reshape_key = self.divide_feature(key, self.scale_factor, token_num, N, c)
124 | reshape_value = self.divide_feature(value, self.scale_factor, token_num, N, value.shape[-1])
125 |
126 | out = self.clip_attn(
127 | reshape_query,
128 | reshape_key,
129 | reshape_value,
130 | attn_mask=attn_mask)[0]
131 |
132 | x = out
133 | x = x.reshape(self.num_queries, N, -1)
134 | x = x.permute(1, 0, 2)
135 |
136 | x = self.mlp(x)
137 | return x
138 |
139 | def _repeat(self, query, N: int):
140 | return query.unsqueeze(1).repeat(1, N, 1)
141 |
142 |
143 |
144 | def build_vision_projector(config):
145 | return TokenPacker(hidden_size=config.hidden_size, scale_factor=config.scale_factor)
146 |
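`TokenPacker` is the core projector: the 24x24 feature grid is bilinearly downsampled by `scale_factor` to form low-resolution queries, and each query attends to its local `scale_factor x scale_factor` region of the multi-level features, so 576 visual tokens are compressed to `(24/scale_factor)^2`. A shape-check sketch with random features, assuming CLIP ViT-L/14-336 dimensions and a batch of 2:

```python
import torch
from llava.model.multimodal_projector.builder import TokenPacker

packer = TokenPacker(hidden_size=4096, scale_factor=2)   # 576 -> 144 tokens
x_single = torch.randn(2, 576, 1024)   # select_layer features (queries, before downsampling)
x_multi = torch.randn(2, 576, 4096)    # four concatenated CLIP layers (keys/values)
out = packer((x_single, x_multi))
print(out.shape)                       # torch.Size([2, 144, 4096]) -- a quarter of the original tokens
```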
--------------------------------------------------------------------------------
/llava/model/utils.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoConfig
2 |
3 |
4 | def auto_upgrade(config):
5 | cfg = AutoConfig.from_pretrained(config)
6 | if 'llava' in config and 'llava' not in cfg.model_type:
7 | assert cfg.model_type == 'llama'
8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
11 | if confirm.lower() in ["y", "yes"]:
12 | print("Upgrading checkpoint...")
13 | assert len(cfg.architectures) == 1
14 | setattr(cfg.__class__, "model_type", "llava")
15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM'
16 | cfg.save_pretrained(config)
17 | print("Checkpoint upgraded.")
18 | else:
19 | print("Checkpoint upgrade aborted.")
20 | exit(1)
21 |
--------------------------------------------------------------------------------
/llava/patch_divide.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchvision.ops.boxes import box_area
3 |
4 | patches_9=[
5 | (1,1),
6 | (1,2),(2,1),
7 | (1,3),(3,1),
8 | (2,2),(1,4),(4,1),
9 | (1,5),(5,1),
10 | (1,6),(6,1),(2,3),(3,2),
11 | (1,7),(7,1),
12 | (4,2),(2,4),(1,8),(8,1),
13 | (3,3),(1,9),(9,1)
14 | ]
15 |
16 | patches_16=[
17 | (1,1),
18 | (1,2),(2,1),
19 | (1,3),(3,1),
20 | (2,2),(1,4),(4,1),
21 | (1,5),(5,1),
22 | (1,6),(6,1),(2,3),(3,2),
23 | (1,7),(7,1),
24 | (4,2),(2,4),(1,8),(8,1),
25 | (3,3),(1,9),(9,1),
26 | (2,5),(5,2),
27 | (2,6),(6,2),(3,4), (4,3),
28 | (2,7),(7,2),
29 | (3,5),(5,3),
30 | (2,8),(8,2),(4,4)
31 | ]
32 |
33 | patches_25=[
34 | (1,1),
35 | (1,2),(2,1),
36 | (1,3),(3,1),
37 | (2,2),(1,4),(4,1),
38 | (1,5),(5,1),
39 | (1,6),(6,1),(2,3),(3,2),
40 | (1,7),(7,1),
41 | (4,2),(2,4),(1,8),(8,1),
42 | (3,3),(1,9),(9,1),
43 | (2,5),(5,2),
44 | (2,6),(6,2),(3,4), (4,3),
45 | (2,7),(7,2),
46 | (3,5),(5,3),
47 | (2,8),(8,2),(4,4),
48 | (3,6),(6,3),(2,9),(9,2),
49 | (4,5),(5,4),(2,10),(10,2),
50 | (3,7),(7,3),
51 | (11,2),(2,11),
52 | (4,6),(6,4),(12,2),(2,12),(3,8),(8,3),
53 | (5,5)
54 | ]
55 |
56 |
57 | def box_iou(boxes1, area1, boxes2, eps=1e-5):
58 | area2 = box_area(boxes2)
59 |
60 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
61 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
62 |
63 | wh = (rb - lt).clamp(min=0) # [N,M,2]
64 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
65 |
66 | union = area1[:, None] + area2 - inter
67 |
68 | iou = inter / (union+eps)
69 | return iou, union
70 |
71 | class Image_Patch:
72 | def __init__(self, image_size=336, patch_num=9):
73 | if patch_num == 9:
74 | patches = patches_9
75 | elif patch_num == 16:
76 | patches = patches_16
77 | elif patch_num == 25:
78 | patches = patches_25
79 | else:
80 | raise(NotImplementedError)
81 |
82 | # h,w
83 | if isinstance(image_size, int):
84 | image_size = (image_size, image_size)
85 | self.image_size = image_size
86 |
87 | self.patch_list = patches
88 |
89 | self.patches = torch.tensor(
90 | [[0, 0, _[0]*image_size[0], _[1]*image_size[1]]
91 | for _ in patches], requires_grad=False
92 | )
93 |
94 | self.patch_areas = box_area(self.patches)
95 |
96 | def calculate(self, h, w):
97 | input_box = torch.tensor([0, 0, h, w]).unsqueeze(0)
98 | ratio = self.patches[:, 2:]/input_box[:, 2:]
99 | ratio = ratio.min(dim=-1)[0]
100 | score = torch.round(h*ratio) * torch.round(w*ratio) / self.patch_areas
101 | iou, _ = box_iou(self.patches, self.patch_areas, input_box*1.4)
102 | iou = iou[:, 0]
103 | score = score + iou*0.1
104 | idx = torch.argmax(score)
105 | return self.patch_list[idx]
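`Image_Patch.calculate` scores every candidate `(rows, cols)` grid by how much of the aspect-ratio-preserving resize it covers, plus a small IoU bonus toward grids whose overall shape matches the input, and returns the best one; the high-resolution CLI below uses it to decide how many 336x336 crops to cut. An illustrative call with an assumed wide input:

```python
from llava.patch_divide import Image_Patch

image_patch = Image_Patch(image_size=336, patch_num=9)
h_block, w_block = image_patch.calculate(h=500, w=1200)
print(h_block, w_block)   # (2, 4) for this aspect ratio: two rows of four 336x336 blocks
```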
--------------------------------------------------------------------------------
/llava/serve/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/serve/__init__.py
--------------------------------------------------------------------------------
/llava/serve/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 | import os
4 |
5 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
6 | from llava.conversation import conv_templates, SeparatorStyle
7 | from llava.model.builder import load_pretrained_model
8 | from llava.utils import disable_torch_init
9 | from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
10 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
11 | from llava.model import *
12 | import torch.nn.functional as F
13 | from functools import partial
14 | from llava.patch_divide import Image_Patch
15 | from torchvision.transforms import Compose, ToTensor, Normalize
16 |
17 | from PIL import Image
18 |
19 | import requests
20 |
21 | from io import BytesIO
22 | from transformers import TextStreamer
23 |
24 | import time
25 |
26 | def main(args):
27 | # Model
28 | disable_torch_init()
29 | model_path = os.path.expanduser(args.model_path)
30 | model_name = get_model_name_from_path(model_path)
31 | tokenizer = AutoTokenizer.from_pretrained(
32 | args.model_path,
33 | model_max_length = 2048,
34 | padding_side="right",
35 | use_fast = True
36 | )
37 | model = LlavaLlamaForCausalLM.from_pretrained(
38 | args.model_path,
39 | torch_dtype=torch.bfloat16,
40 | ).cuda()
41 |
42 | for m in model.modules():
43 | m.tokenizer = tokenizer
44 |
45 | vision_tower = model.get_vision_tower()
46 | if not vision_tower.is_loaded:
47 | vision_tower.load_model()
48 | vision_tower.to(device='cuda', dtype=torch.float16)
49 | image_processor = vision_tower.image_processor
50 |
51 | patch_num = getattr(model.config, 'patch_num', '9')
52 | image_patch = Image_Patch(patch_num=int(patch_num))
53 | preprocess = Compose([ToTensor(), Normalize((0.48145466, 0.4578275, 0.40821073),(0.26862954, 0.26130258, 0.27577711))])
54 |
55 |
56 | while True:
57 | conv = conv_templates[args.conv_mode].copy()
58 | if "mpt" in model_name.lower():
59 | roles = ('user', 'assistant')
60 | else:
61 | roles = conv.roles
62 |
63 | image_file = input("image file: ")
64 |
65 | image = Image.open(image_file).convert('RGB')
66 |
67 | if model.config.image_aspect_ratio == 'slice':
68 | image = preprocess(image)
69 | image = image.unsqueeze(0)
70 | h, w = image.shape[-2:]
71 | block_size = 336
72 | h_block, w_block = image_patch.calculate(h, w)
73 | h_ratio = block_size*h_block/h
74 | w_ratio = block_size*w_block/w
75 | if h_ratio<=w_ratio:
76 | w_ = min(block_size*w_block, round(w*h_ratio))
77 | h_ = block_size*h_block
78 | else:
79 | w_ = block_size*w_block
80 | h_ = min(block_size*h_block, round(h*w_ratio))
81 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear')
82 | image = torch.zeros((1, 3, block_size*h_block, block_size*w_block)).to(dtype=image_inter.dtype, device=image_inter.device)
83 | image[:, :, :h_, :w_] = image_inter
84 |
85 | split_images = []
86 | for i_ in range(h_block):
87 | for j_ in range(w_block):
88 | image_s = image[:,:,block_size*i_:block_size*(i_+1), block_size*j_:block_size*(j_+1)]
89 | split_images.append(image_s)
90 | if len(split_images)>1:
91 | h_ratio = block_size/h
92 | w_ratio = block_size/w
93 | if h_ratio<=w_ratio:
94 | w_ = min(block_size, round(w*h_ratio))
95 | h_ = block_size
96 | else:
97 | w_ = block_size
98 | h_ = min(block_size, round(h*w_ratio))
99 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear')
100 | image_s = torch.zeros((1, 3, block_size, block_size)).to(dtype=image_inter.dtype, device=image_inter.device)
101 | image_s[:, :, :h_, :w_] = image_inter
102 | split_images.append(image_s)
103 | image_tensor = torch.cat(split_images, dim=0)
104 | else:
105 | image_tensor = process_images([image], image_processor, model.config)[0]
106 | image_tensor = image_tensor.unsqueeze(0)
107 | h_block = 1
108 | w_block = 1
109 |
110 | try:
111 | inp = input(f"{roles[0]}: ")
112 | except EOFError:
113 | inp = ""
114 | if not inp:
115 | print("exit...")
116 | break
117 | # inp = "what is in the image?"
118 |
119 | print(f"{roles[1]}: ", end="")
120 |
121 | if image is not None:
122 | if model.config.mm_use_im_start_end:
123 | inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
124 | else:
125 | inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
126 | conv.append_message(conv.roles[0], inp)
127 | image = None
128 | else:
129 | # later messages
130 | conv.append_message(conv.roles[0], inp)
131 | conv.append_message(conv.roles[1], None)
132 | prompt = conv.get_prompt()
133 |
134 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
135 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
136 | keywords = [stop_str]
137 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
138 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
139 |
140 | mode = model.config.image_aspect_ratio
141 | with torch.inference_mode():
142 | model.orig_forward = model.forward
143 | model.forward = partial(model.orig_forward,
144 | mode=mode,
145 | h_block=h_block,
146 | w_block=w_block)
147 | start = time.time()
148 |
149 | output_ids = model.generate(
150 | input_ids,
151 | images=image_tensor.to(dtype=torch.bfloat16, device='cuda', non_blocking=True),
152 | do_sample=True,
153 | temperature=args.temperature,
154 | max_new_tokens=args.max_new_tokens,
155 | streamer=streamer,
156 | use_cache=True,
157 | stopping_criteria=[stopping_criteria])
158 | model.forward = model.orig_forward
159 |
160 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
161 | end = time.time()
162 | print("***time: ", end-start)
163 | conv.messages[-1][-1] = outputs
164 |
165 | if args.debug:
166 | print("\n", {"prompt": prompt, "outputs": outputs}, "\n")
167 |
168 |
169 | if __name__ == "__main__":
170 | parser = argparse.ArgumentParser()
171 | parser.add_argument("--model-path", type=str, default="path/to/tokenpacker")
172 | parser.add_argument("--device", type=str, default="cuda")
173 | parser.add_argument("--conv-mode", type=str, default='vicuna_v1')
174 | parser.add_argument("--temperature", type=float, default=0.2)
175 | parser.add_argument("--max-new-tokens", type=int, default=512)
176 | parser.add_argument("--load-8bit", action="store_true")
177 | parser.add_argument("--load-4bit", action="store_true")
178 | parser.add_argument("--debug", action="store_true")
179 | args = parser.parse_args()
180 | main(args)
181 |
--------------------------------------------------------------------------------
/llava/serve/examples/extreme_ironing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/serve/examples/extreme_ironing.jpg
--------------------------------------------------------------------------------
/llava/serve/examples/waterview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/serve/examples/waterview.jpg
--------------------------------------------------------------------------------
/llava/serve/register_worker.py:
--------------------------------------------------------------------------------
1 | """
2 | Manually register workers.
3 |
4 | Usage:
5 | python3 -m llava.serve.register_worker --controller-address http://localhost:21001 --worker-name http://localhost:21002
6 | """
7 |
8 | import argparse
9 |
10 | import requests
11 |
12 | if __name__ == "__main__":
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument("--controller-address", type=str)
15 | parser.add_argument("--worker-name", type=str)
16 | parser.add_argument("--check-heart-beat", action="store_true")
17 | args = parser.parse_args()
18 |
19 | url = args.controller_address + "/register_worker"
20 | data = {
21 | "worker_name": args.worker_name,
22 | "check_heart_beat": args.check_heart_beat,
23 | "worker_status": None,
24 | }
25 | r = requests.post(url, json=data)
26 | assert r.status_code == 200
27 |
--------------------------------------------------------------------------------
/llava/serve/test_message.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 |
4 | import requests
5 |
6 | from llava.conversation import default_conversation
7 |
8 |
9 | def main():
10 | if args.worker_address:
11 | worker_addr = args.worker_address
12 | else:
13 | controller_addr = args.controller_address
14 | ret = requests.post(controller_addr + "/refresh_all_workers")
15 | ret = requests.post(controller_addr + "/list_models")
16 | models = ret.json()["models"]
17 | models.sort()
18 | print(f"Models: {models}")
19 |
20 | ret = requests.post(controller_addr + "/get_worker_address",
21 | json={"model": args.model_name})
22 | worker_addr = ret.json()["address"]
23 | print(f"worker_addr: {worker_addr}")
24 |
25 | if worker_addr == "":
26 | return
27 |
28 | conv = default_conversation.copy()
29 | conv.append_message(conv.roles[0], args.message)
30 | prompt = conv.get_prompt()
31 |
32 | headers = {"User-Agent": "LLaVA Client"}
33 | pload = {
34 | "model": args.model_name,
35 | "prompt": prompt,
36 | "max_new_tokens": args.max_new_tokens,
37 | "temperature": 0.7,
38 | "stop": conv.sep,
39 | }
40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers,
41 | json=pload, stream=True)
42 |
43 | print(prompt.replace(conv.sep, "\n"), end="")
44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
45 | if chunk:
46 | data = json.loads(chunk.decode("utf-8"))
47 | output = data["text"].split(conv.sep)[-1]
48 | print(output, end="\r")
49 | print("")
50 |
51 |
52 | if __name__ == "__main__":
53 | parser = argparse.ArgumentParser()
54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
55 | parser.add_argument("--worker-address", type=str)
56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
57 | parser.add_argument("--max-new-tokens", type=int, default=32)
58 | parser.add_argument("--message", type=str, default=
59 | "Tell me a story with more than 1000 words.")
60 | args = parser.parse_args()
61 |
62 | main()
63 |
--------------------------------------------------------------------------------
/llava/train/llama_flash_attn_monkey_patch.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple
2 | import warnings
3 |
4 | import torch
5 |
6 | import transformers
7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
8 |
9 | try:
10 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
11 | except ImportError:
12 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
13 | from flash_attn.bert_padding import unpad_input, pad_input
14 |
15 |
16 | def forward(
17 | self,
18 | hidden_states: torch.Tensor,
19 | attention_mask: Optional[torch.Tensor] = None,
20 | position_ids: Optional[torch.Tensor] = None,
21 | past_key_value: Optional[Tuple[torch.Tensor]] = None,
22 | output_attentions: bool = False,
23 | use_cache: bool = False,
24 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
25 | if output_attentions:
26 | warnings.warn(
27 | "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
28 | )
29 |
30 | bsz, q_len, _ = hidden_states.size()
31 |
32 | query_states = (
33 | self.q_proj(hidden_states)
34 | .view(bsz, q_len, self.num_heads, self.head_dim)
35 | .transpose(1, 2)
36 | )
37 | key_states = (
38 | self.k_proj(hidden_states)
39 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
40 | .transpose(1, 2)
41 | )
42 | value_states = (
43 | self.v_proj(hidden_states)
44 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
45 | .transpose(1, 2)
46 | ) # shape: (b, num_heads, s, head_dim)
47 |
48 | kv_seq_len = key_states.shape[-2]
49 | if past_key_value is not None:
50 | kv_seq_len += past_key_value[0].shape[-2]
51 |
52 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
53 | query_states, key_states = apply_rotary_pos_emb(
54 | query_states, key_states, cos, sin, position_ids
55 | )
56 |
57 | if past_key_value is not None:
58 | # reuse k, v
59 | key_states = torch.cat([past_key_value[0], key_states], dim=2)
60 | value_states = torch.cat([past_key_value[1], value_states], dim=2)
61 |
62 | past_key_value = (key_states, value_states) if use_cache else None
63 |
64 | # repeat k/v heads if n_kv_heads < n_heads
65 | key_states = repeat_kv(key_states, self.num_key_value_groups)
66 | value_states = repeat_kv(value_states, self.num_key_value_groups)
67 |
68 | # Transform the data into the format required by flash attention
69 | qkv = torch.stack([query_states, key_states, value_states], dim=2)
70 | qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim]
71 | key_padding_mask = attention_mask
72 |
73 | if key_padding_mask is None:
74 | qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim)
75 | cu_q_lens = torch.arange(
76 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device
77 | )
78 | max_s = q_len
79 | output = flash_attn_unpadded_qkvpacked_func(
80 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
81 | )
82 | output = output.view(bsz, q_len, -1)
83 | else:
84 | qkv = qkv.reshape(bsz, q_len, -1)
85 | qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask)
86 | qkv = qkv.view(-1, 3, self.num_heads, self.head_dim)
87 | output_unpad = flash_attn_unpadded_qkvpacked_func(
88 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
89 | )
90 | output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
91 | output = pad_input(output_unpad, indices, bsz, q_len)
92 |
93 | return self.o_proj(output), None, past_key_value
94 |
95 |
96 | # Disable the transformation of the attention mask in LlamaModel as the flash attention
97 | # requires the attention mask to be the same as the key_padding_mask
98 | def _prepare_decoder_attention_mask(
99 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length
100 | ):
101 | # [bsz, seq_len]
102 | return attention_mask
103 |
104 |
105 | def replace_llama_attn_with_flash_attn():
106 | cuda_major, cuda_minor = torch.cuda.get_device_capability()
107 | if cuda_major < 8:
108 | warnings.warn(
109 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
110 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
111 | )
112 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
113 | _prepare_decoder_attention_mask
114 | )
115 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
116 |
--------------------------------------------------------------------------------
/llava/train/train_mem.py:
--------------------------------------------------------------------------------
1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.
4 |
5 | # Need to call this before importing transformers.
6 | from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
7 |
8 | replace_llama_attn_with_flash_attn()
9 |
10 | from llava.train.train import train
11 |
12 | if __name__ == "__main__":
13 | train()
14 |
--------------------------------------------------------------------------------
/llava/utils.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | import logging.handlers
4 | import os
5 | import sys
6 |
7 | import requests
8 |
9 | from llava.constants import LOGDIR
10 |
11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
13 |
14 | handler = None
15 |
16 |
17 | def build_logger(logger_name, logger_filename):
18 | global handler
19 |
20 | formatter = logging.Formatter(
21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
22 | datefmt="%Y-%m-%d %H:%M:%S",
23 | )
24 |
25 | # Set the format of root handlers
26 | if not logging.getLogger().handlers:
27 | logging.basicConfig(level=logging.INFO)
28 | logging.getLogger().handlers[0].setFormatter(formatter)
29 |
30 | # Redirect stdout and stderr to loggers
31 | stdout_logger = logging.getLogger("stdout")
32 | stdout_logger.setLevel(logging.INFO)
33 | sl = StreamToLogger(stdout_logger, logging.INFO)
34 | sys.stdout = sl
35 |
36 | stderr_logger = logging.getLogger("stderr")
37 | stderr_logger.setLevel(logging.ERROR)
38 | sl = StreamToLogger(stderr_logger, logging.ERROR)
39 | sys.stderr = sl
40 |
41 | # Get logger
42 | logger = logging.getLogger(logger_name)
43 | logger.setLevel(logging.INFO)
44 |
45 | # Add a file handler for all loggers
46 | if handler is None:
47 | os.makedirs(LOGDIR, exist_ok=True)
48 | filename = os.path.join(LOGDIR, logger_filename)
49 | handler = logging.handlers.TimedRotatingFileHandler(
50 | filename, when='D', utc=True)
51 | handler.setFormatter(formatter)
52 |
53 | for name, item in logging.root.manager.loggerDict.items():
54 | if isinstance(item, logging.Logger):
55 | item.addHandler(handler)
56 |
57 | return logger
58 |
59 |
60 | class StreamToLogger(object):
61 | """
62 | Fake file-like stream object that redirects writes to a logger instance.
63 | """
64 | def __init__(self, logger, log_level=logging.INFO):
65 | self.terminal = sys.stdout
66 | self.logger = logger
67 | self.log_level = log_level
68 | self.linebuf = ''
69 |
70 | def __getattr__(self, attr):
71 | return getattr(self.terminal, attr)
72 |
73 | def write(self, buf):
74 | temp_linebuf = self.linebuf + buf
75 | self.linebuf = ''
76 | for line in temp_linebuf.splitlines(True):
77 | # From the io.TextIOWrapper docs:
78 | # On output, if newline is None, any '\n' characters written
79 | # are translated to the system default line separator.
80 | # By default sys.stdout.write() expects '\n' newlines and then
81 | # translates them so this is still cross platform.
82 | if line[-1] == '\n':
83 | self.logger.log(self.log_level, line.rstrip())
84 | else:
85 | self.linebuf += line
86 |
87 | def flush(self):
88 | if self.linebuf != '':
89 | self.logger.log(self.log_level, self.linebuf.rstrip())
90 | self.linebuf = ''
91 |
92 |
93 | def disable_torch_init():
94 | """
95 | Disable the redundant torch default initialization to accelerate model creation.
96 | """
97 | import torch
98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
100 |
101 |
102 | def violates_moderation(text):
103 | """
104 | Check whether the text violates OpenAI moderation API.
105 | """
106 | url = "https://api.openai.com/v1/moderations"
107 | headers = {"Content-Type": "application/json",
108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
109 |     text = text.replace("\n", "")
110 |     # Let requests serialize the payload so quotes and backslashes in text remain valid JSON.
111 |     payload = {"input": text}
112 |     try:
113 |         ret = requests.post(url, headers=headers, json=payload, timeout=5)
114 | flagged = ret.json()["results"][0]["flagged"]
115 | except requests.exceptions.RequestException as e:
116 | flagged = False
117 | except KeyError as e:
118 | flagged = False
119 |
120 | return flagged
121 |
122 |
123 | def pretty_print_semaphore(semaphore):
124 | if semaphore is None:
125 | return "None"
126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
127 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "llava"
7 | version = "1.1.3"
8 | description = "Towards GPT-4 like large language and visual assistant."
9 | readme = "README.md"
10 | requires-python = ">=3.8"
11 | classifiers = [
12 | "Programming Language :: Python :: 3",
13 | "License :: OSI Approved :: Apache Software License",
14 | ]
15 | dependencies = [
16 | "accelerate==0.21.0", "peft==0.4.0", "bitsandbytes==0.41.0", "transformers==4.31.0",
17 | "pydantic<2,>=1", "markdown2[all]", "numpy", "scikit-learn==1.2.2",
18 | "gradio==3.35.2", "gradio_client==0.2.9",
19 | "requests", "httpx==0.24.0", "uvicorn", "fastapi",
20 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13",
21 | ]
22 |
23 | [project.optional-dependencies]
24 | train = ["deepspeed==0.9.5", "ninja", "wandb"]
25 |
26 | [project.urls]
27 | "Homepage" = "https://llava-vl.github.io"
28 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues"
29 |
30 | [tool.setuptools.packages.find]
31 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
32 |
33 | [tool.wheel]
34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
35 |
--------------------------------------------------------------------------------
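
A minimal sketch of installing the package defined above, assuming a fresh Python >= 3.8 environment; the extra name `train` comes from [project.optional-dependencies]:

    # Base install: inference and serving dependencies only.
    pip install -e .
    # Also pull in the training extras (deepspeed, ninja, wandb).
    pip install -e ".[train]"
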
/scripts/convert_docvqa_for_eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--src", type=str)
7 | parser.add_argument("--dst", type=str)
8 | args = parser.parse_args()
9 |
10 | all_answers = []
11 | for line_idx, line in enumerate(open(args.src)):
12 | res = json.loads(line)
13 | question_id = res['questionId']
14 | text = res['answer'].rstrip('.')
15 | all_answers.append({"questionId": question_id, "answer": text})
16 |
17 | with open(args.dst, 'w') as f:
18 | json.dump(all_answers, f)
19 |
20 |
--------------------------------------------------------------------------------
/scripts/convert_gqa_for_eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--src", type=str)
7 | parser.add_argument("--dst", type=str)
8 | args = parser.parse_args()
9 |
10 | all_answers = []
11 | for line_idx, line in enumerate(open(args.src)):
12 | res = json.loads(line)
13 | question_id = res['question_id']
14 | text = res['text'].rstrip('.').lower()
15 | all_answers.append({"questionId": question_id, "prediction": text})
16 |
17 | with open(args.dst, 'w') as f:
18 | json.dump(all_answers, f)
19 |
--------------------------------------------------------------------------------
/scripts/convert_mmbench_for_submission.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import pandas as pd
5 |
6 | def get_args():
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument("--annotation-file", type=str, required=True)
9 | parser.add_argument("--result-dir", type=str, required=True)
10 | parser.add_argument("--upload-dir", type=str, required=True)
11 | parser.add_argument("--experiment", type=str, required=True)
12 |
13 | return parser.parse_args()
14 |
15 | if __name__ == "__main__":
16 | args = get_args()
17 |
18 | df = pd.read_table(args.annotation_file)
19 |
20 | cur_df = df.copy()
21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category'])
22 | cur_df.insert(6, 'prediction', None)
23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")):
24 | pred = json.loads(pred)
25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text']
26 |
27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl')
28 |
--------------------------------------------------------------------------------
/scripts/convert_mmvet_for_eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--src", type=str)
7 | parser.add_argument("--dst", type=str)
8 | args = parser.parse_args()
9 |
10 | cur_result = {}
11 |
12 | for line in open(args.src):
13 | data = json.loads(line)
14 | qid = data['question_id']
15 | cur_result[f'v1_{qid}'] = data['text']
16 |
17 | with open(args.dst, 'w') as f:
18 | json.dump(cur_result, f, indent=2)
19 |
--------------------------------------------------------------------------------
/scripts/convert_vizwiz_for_submission.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import json
4 |
5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor
6 |
7 |
8 | def parse_args():
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--annotation-file', type=str, required=True)
11 | parser.add_argument('--result-file', type=str, required=True)
12 | parser.add_argument('--result-upload-file', type=str, required=True)
13 | return parser.parse_args()
14 |
15 |
16 | if __name__ == '__main__':
17 |
18 | args = parse_args()
19 |
20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True)
21 |
22 | results = []
23 | error_line = 0
24 | for line_idx, line in enumerate(open(args.result_file)):
25 | try:
26 | results.append(json.loads(line))
27 |         except json.JSONDecodeError:
28 | error_line += 1
29 | results = {x['question_id']: x['text'] for x in results}
30 | test_split = [json.loads(line) for line in open(args.annotation_file)]
31 | split_ids = set([x['question_id'] for x in test_split])
32 |
33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}')
34 |
35 | all_answers = []
36 |
37 | answer_processor = EvalAIAnswerProcessor()
38 |
39 | for x in test_split:
40 | assert x['question_id'] in results
41 | all_answers.append({
42 | 'image': x['image'],
43 | 'answer': answer_processor(results[x['question_id']])
44 | })
45 |
46 | with open(args.result_upload_file, 'w') as f:
47 | json.dump(all_answers, f)
48 |
--------------------------------------------------------------------------------
/scripts/convert_vqav2_for_submission.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import json
4 |
5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor
6 |
7 |
8 | def parse_args():
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2")
11 | parser.add_argument('--ckpt', type=str, required=True)
12 | parser.add_argument('--split', type=str, required=True)
13 | return parser.parse_args()
14 |
15 |
16 | if __name__ == '__main__':
17 |
18 | args = parse_args()
19 |
20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl')
21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl')
22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json')
23 | os.makedirs(os.path.dirname(dst), exist_ok=True)
24 |
25 | results = []
26 | error_line = 0
27 | for line_idx, line in enumerate(open(src)):
28 | try:
29 | results.append(json.loads(line))
30 |     except json.JSONDecodeError:
31 | error_line += 1
32 |
33 | results = {x['question_id']: x['text'] for x in results}
34 | test_split = [json.loads(line) for line in open(test_split)]
35 | split_ids = set([x['question_id'] for x in test_split])
36 |
37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}')
38 |
39 | all_answers = []
40 |
41 | answer_processor = EvalAIAnswerProcessor()
42 |
43 | for x in test_split:
44 | if x['question_id'] not in results:
45 | all_answers.append({
46 | 'question_id': x['question_id'],
47 | 'answer': ''
48 | })
49 | else:
50 | all_answers.append({
51 | 'question_id': x['question_id'],
52 | 'answer': answer_processor(results[x['question_id']])
53 | })
54 |
55 | with open(dst, 'w') as f:
56 |         json.dump(all_answers, f)
57 |
--------------------------------------------------------------------------------
/scripts/extract_mm_projector.py:
--------------------------------------------------------------------------------
1 | """
2 | This is just a utility that I use to extract the projector for quantized models.
3 | It is NOT necessary at all to train, or run inference/serve demos.
4 | Use this script ONLY if you fully understand its implications.
5 | """
6 |
7 |
8 | import os
9 | import argparse
10 | import torch
11 | import json
12 | from collections import defaultdict
13 |
14 |
15 | def parse_args():
16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights')
17 | parser.add_argument('--model-path', type=str, help='model folder')
18 | parser.add_argument('--output', type=str, help='output file')
19 | args = parser.parse_args()
20 | return args
21 |
22 |
23 | if __name__ == '__main__':
24 | args = parse_args()
25 |
26 | keys_to_match = ['mm_projector']
27 | ckpt_to_key = defaultdict(list)
28 | try:
29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json')))
30 | for k, v in model_indices['weight_map'].items():
31 | if any(key_match in k for key_match in keys_to_match):
32 | ckpt_to_key[v].append(k)
33 | except FileNotFoundError:
34 | # Smaller models or model checkpoints saved by DeepSpeed.
35 | v = 'pytorch_model.bin'
36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys():
37 | if any(key_match in k for key_match in keys_to_match):
38 | ckpt_to_key[v].append(k)
39 |
40 | loaded_weights = {}
41 |
42 | for ckpt_name, weight_keys in ckpt_to_key.items():
43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu')
44 | for k in weight_keys:
45 | loaded_weights[k] = ckpt[k]
46 |
47 | torch.save(loaded_weights, args.output)
48 |
--------------------------------------------------------------------------------
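
A hedged usage sketch for the extractor above; both paths are placeholders rather than paths shipped with the repo:

    # Save only the weights whose names contain 'mm_projector' from a full checkpoint folder.
    python scripts/extract_mm_projector.py \
        --model-path ./checkpoints/llava-tokenpacker-pretrain \
        --output ./checkpoints/llava-tokenpacker-pretrain/mm_projector.bin
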
/scripts/finetune.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!
4 |
5 | # Uncomment and set the following variables correspondingly to run this script:
6 |
7 | ################## VICUNA ##################
8 | # PROMPT_VERSION=v1
9 | # MODEL_VERSION="vicuna-v1-3-7b"
10 | ################## VICUNA ##################
11 |
12 | ################## LLaMA-2 ##################
13 | # PROMPT_VERSION="llava_llama_2"
14 | # MODEL_VERSION="llama-2-7b-chat"
15 | ################## LLaMA-2 ##################
16 |
17 | deepspeed llava/train/train_mem.py \
18 | --deepspeed ./scripts/zero2.json \
19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \
20 | --version $PROMPT_VERSION \
21 | --data_path ./playground/data/llava_instruct_80k.json \
22 | --image_folder /path/to/coco/train2017 \
23 | --vision_tower openai/clip-vit-large-patch14 \
24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \
25 | --mm_vision_select_layer -2 \
26 | --mm_use_im_start_end False \
27 | --mm_use_im_patch_token False \
28 | --bf16 True \
29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \
30 | --num_train_epochs 1 \
31 | --per_device_train_batch_size 16 \
32 | --per_device_eval_batch_size 4 \
33 | --gradient_accumulation_steps 1 \
34 | --evaluation_strategy "no" \
35 | --save_strategy "steps" \
36 | --save_steps 50000 \
37 | --save_total_limit 1 \
38 | --learning_rate 2e-5 \
39 | --weight_decay 0. \
40 | --warmup_ratio 0.03 \
41 | --lr_scheduler_type "cosine" \
42 | --logging_steps 1 \
43 | --tf32 True \
44 | --model_max_length 2048 \
45 | --gradient_checkpointing True \
46 | --dataloader_num_workers 4 \
47 | --lazy_preprocess True \
48 | --report_to wandb
49 |
--------------------------------------------------------------------------------
/scripts/finetune_full_schedule.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!
4 |
5 | # Uncomment and set the following variables correspondingly to run this script:
6 |
7 | ################## VICUNA ##################
8 | # PROMPT_VERSION=v1
9 | # MODEL_VERSION="vicuna-v1-3-7b"
10 | ################## VICUNA ##################
11 |
12 | ################## LLaMA-2 ##################
13 | # PROMPT_VERSION="llava_llama_2"
14 | # MODEL_VERSION="llama-2-7b-chat"
15 | ################## LLaMA-2 ##################
16 |
17 | deepspeed llava/train/train_mem.py \
18 | --deepspeed ./scripts/zero2.json \
19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \
20 | --version $PROMPT_VERSION \
21 | --data_path ./playground/data/llava_instruct_158k.json \
22 | --image_folder /path/to/coco/train2017 \
23 | --vision_tower openai/clip-vit-large-patch14 \
24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \
25 | --mm_vision_select_layer -2 \
26 | --mm_use_im_start_end False \
27 | --mm_use_im_patch_token False \
28 | --bf16 True \
29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \
30 | --num_train_epochs 3 \
31 | --per_device_train_batch_size 16 \
32 | --per_device_eval_batch_size 4 \
33 | --gradient_accumulation_steps 1 \
34 | --evaluation_strategy "no" \
35 | --save_strategy "steps" \
36 | --save_steps 50000 \
37 | --save_total_limit 1 \
38 | --learning_rate 2e-5 \
39 | --weight_decay 0. \
40 | --warmup_ratio 0.03 \
41 | --lr_scheduler_type "cosine" \
42 | --logging_steps 1 \
43 | --tf32 True \
44 | --model_max_length 2048 \
45 | --gradient_checkpointing True \
46 | --dataloader_num_workers 4 \
47 | --lazy_preprocess True \
48 | --report_to wandb
49 |
--------------------------------------------------------------------------------
/scripts/finetune_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!
4 |
5 | # Uncomment and set the following variables correspondingly to run this script:
6 |
7 | ################## VICUNA ##################
8 | # PROMPT_VERSION=v1
9 | # MODEL_VERSION="vicuna-v1-3-7b"
10 | ################## VICUNA ##################
11 |
12 | ################## LLaMA-2 ##################
13 | # PROMPT_VERSION="llava_llama_2"
14 | # MODEL_VERSION="llama-2-7b-chat"
15 | ################## LLaMA-2 ##################
16 |
17 | deepspeed llava/train/train_mem.py \
18 | --deepspeed ./scripts/zero2.json \
19 | --lora_enable True \
20 | --model_name_or_path ./checkpoints/$MODEL_VERSION \
21 | --version $PROMPT_VERSION \
22 | --data_path ./playground/data/llava_instruct_80k.json \
23 | --image_folder /path/to/coco/train2017 \
24 | --vision_tower openai/clip-vit-large-patch14 \
25 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \
26 | --mm_vision_select_layer -2 \
27 | --mm_use_im_start_end False \
28 | --mm_use_im_patch_token False \
29 | --bf16 True \
30 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \
31 | --num_train_epochs 1 \
32 | --per_device_train_batch_size 16 \
33 | --per_device_eval_batch_size 4 \
34 | --gradient_accumulation_steps 1 \
35 | --evaluation_strategy "no" \
36 | --save_strategy "steps" \
37 | --save_steps 50000 \
38 | --save_total_limit 1 \
39 | --learning_rate 2e-5 \
40 | --weight_decay 0. \
41 | --warmup_ratio 0.03 \
42 | --lr_scheduler_type "cosine" \
43 | --logging_steps 1 \
44 | --tf32 True \
45 | --model_max_length 2048 \
46 | --gradient_checkpointing True \
47 | --lazy_preprocess True \
48 | --dataloader_num_workers 4 \
49 | --report_to wandb
50 |
--------------------------------------------------------------------------------
/scripts/finetune_qlora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!
4 |
5 | # Uncomment and set the following variables correspondingly to run this script:
6 |
7 | ################## VICUNA ##################
8 | # PROMPT_VERSION=v1
9 | # MODEL_VERSION="vicuna-v1-3-7b"
10 | ################## VICUNA ##################
11 |
12 | ################## LLaMA-2 ##################
13 | # PROMPT_VERSION="llava_llama_2"
14 | # MODEL_VERSION="llama-2-7b-chat"
15 | ################## LLaMA-2 ##################
16 |
17 | deepspeed llava/train/train_mem.py \
18 | --deepspeed ./scripts/zero2.json \
19 | --lora_enable True \
20 | --bits 4 \
21 | --model_name_or_path ./checkpoints/$MODEL_VERSION \
22 | --version $PROMPT_VERSION \
23 | --data_path ./playground/data/llava_instruct_80k.json \
24 | --image_folder /path/to/coco/train2017 \
25 | --vision_tower openai/clip-vit-large-patch14 \
26 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \
27 | --mm_vision_select_layer -2 \
28 | --mm_use_im_start_end False \
29 | --mm_use_im_patch_token False \
30 | --bf16 True \
31 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \
32 | --num_train_epochs 1 \
33 | --per_device_train_batch_size 16 \
34 | --per_device_eval_batch_size 4 \
35 | --gradient_accumulation_steps 1 \
36 | --evaluation_strategy "no" \
37 | --save_strategy "steps" \
38 | --save_steps 50000 \
39 | --save_total_limit 1 \
40 | --learning_rate 2e-5 \
41 | --weight_decay 0. \
42 | --warmup_ratio 0.03 \
43 | --lr_scheduler_type "cosine" \
44 | --logging_steps 1 \
45 | --tf32 True \
46 | --model_max_length 2048 \
47 | --gradient_checkpointing True \
48 | --lazy_preprocess True \
49 | --dataloader_num_workers 4 \
50 | --report_to wandb
51 |
--------------------------------------------------------------------------------
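
Both LoRA recipes above leave adapter weights in ./checkpoints/llava-$MODEL_VERSION-finetune_lora, and the script that follows merges them back into the base model. A sketch of that step, assuming MODEL_VERSION is set as in the scripts above and with the save path chosen purely as an example:

    python scripts/merge_lora_weights.py \
        --model-path ./checkpoints/llava-$MODEL_VERSION-finetune_lora \
        --model-base ./checkpoints/$MODEL_VERSION \
        --save-model-path ./checkpoints/llava-$MODEL_VERSION-merged
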
/scripts/merge_lora_weights.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from llava.model.builder import load_pretrained_model
3 | from llava.mm_utils import get_model_name_from_path
4 |
5 |
6 | def merge_lora(args):
7 | model_name = get_model_name_from_path(args.model_path)
8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu')
9 |
10 | model.save_pretrained(args.save_model_path)
11 | tokenizer.save_pretrained(args.save_model_path)
12 |
13 |
14 | if __name__ == "__main__":
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument("--model-path", type=str, required=True)
17 | parser.add_argument("--model-base", type=str, required=True)
18 | parser.add_argument("--save-model-path", type=str, required=True)
19 |
20 | args = parser.parse_args()
21 |
22 | merge_lora(args)
23 |
--------------------------------------------------------------------------------
/scripts/pretrain.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!
4 |
5 | # Uncomment and set the following variables correspondingly to run this script:
6 |
7 | # MODEL_VERSION=vicuna-v1-3-7b
8 | # MODEL_VERSION=llama-2-7b-chat
9 |
10 | ########### DO NOT CHANGE ###########
11 | ########### USE THIS FOR BOTH ###########
12 | PROMPT_VERSION=plain
13 | ########### DO NOT CHANGE ###########
14 |
15 | deepspeed llava/train/train_mem.py \
16 | --deepspeed ./scripts/zero2.json \
17 | --model_name_or_path ./checkpoints/$MODEL_VERSION \
18 | --version $PROMPT_VERSION \
19 | --data_path /path/to/pretrain_data.json \
20 | --image_folder /path/to/images \
21 | --vision_tower openai/clip-vit-large-patch14 \
22 | --tune_mm_mlp_adapter True \
23 | --mm_vision_select_layer -2 \
24 | --mm_use_im_start_end False \
25 | --mm_use_im_patch_token False \
26 | --bf16 True \
27 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \
28 | --num_train_epochs 1 \
29 | --per_device_train_batch_size 16 \
30 | --per_device_eval_batch_size 4 \
31 | --gradient_accumulation_steps 1 \
32 | --evaluation_strategy "no" \
33 | --save_strategy "steps" \
34 | --save_steps 24000 \
35 | --save_total_limit 1 \
36 | --learning_rate 2e-3 \
37 | --weight_decay 0. \
38 | --warmup_ratio 0.03 \
39 | --lr_scheduler_type "cosine" \
40 | --logging_steps 1 \
41 | --tf32 True \
42 | --model_max_length 2048 \
43 | --gradient_checkpointing True \
44 | --dataloader_num_workers 4 \
45 | --lazy_preprocess True \
46 | --report_to wandb
47 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/docvqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT="llava-tokenpacker-7b"
9 |
10 | for IDX in $(seq 0 $((CHUNKS-1))); do
11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.eval_docvqa \
12 | --model-path llava-tokenpacker-7b \
13 | --question-file ./playground/data/eval/docvqa/data/test_v1.0.json \
14 | --image-folder /path/to/docvqa/images \
15 | --answers-file ./playground/data/eval/docvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \
16 | --num-chunks $CHUNKS \
17 | --chunk-idx $IDX \
18 | --temperature 0 \
19 | --conv-mode vicuna_v1 &
20 | done
21 |
22 | wait
23 |
24 | output_file=./playground/data/eval/docvqa/answers/$CKPT/merge.jsonl
25 |
26 | # Clear out the output file if it exists.
27 | > "$output_file"
28 |
29 | # Loop through the indices and concatenate each file.
30 | for IDX in $(seq 0 $((CHUNKS-1))); do
31 | cat ./playground/data/eval/docvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
32 | done
33 |
34 | python scripts/convert_docvqa_for_eval.py --src $output_file --dst ./playground/data/eval/docvqa/answers/$CKPT/submit.json
35 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/gqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT="llava-tokenpacker-7b"
9 | SPLIT="llava_gqa_testdev_balanced"
10 | GQADIR="./playground/data/eval/gqa/data"
11 |
12 | for IDX in $(seq 0 $((CHUNKS-1))); do
13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
14 | --model-path llava-tokenpacker-7b \
15 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \
16 | --image-folder /path/to/gqa/images \
17 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
18 | --num-chunks $CHUNKS \
19 | --chunk-idx $IDX \
20 | --temperature 0 \
21 | --conv-mode vicuna_v1 &
22 | done
23 |
24 | wait
25 |
26 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl
27 |
28 | # Clear out the output file if it exists.
29 | > "$output_file"
30 |
31 | # Loop through the indices and concatenate each file.
32 | for IDX in $(seq 0 $((CHUNKS-1))); do
33 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
34 | done
35 |
36 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json
37 |
38 | cd $GQADIR
39 | python eval/eval.py --tier testdev_balanced
40 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mmbench.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SPLIT="mmbench_dev_20230712"
4 |
5 | python -m llava.eval.model_vqa_mmbench \
6 | --model-path llava-tokenpacker-7b \
7 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \
8 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-tokenpacker-7b.jsonl \
9 | --single-pred-prompt \
10 | --temperature 0 \
11 | --conv-mode vicuna_v1
12 |
13 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT
14 |
15 | python scripts/convert_mmbench_for_submission.py \
16 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \
17 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \
18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \
19 | --experiment llava-tokenpacker-7b
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mme.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | NAME=llava-tokenpacker-7b
3 |
4 | python -m llava.eval.model_vqa_loader \
5 | --model-path llava-tokenpacker-7b \
6 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \
7 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \
8 | --answers-file ./playground/data/eval/MME/answers/$NAME.jsonl \
9 | --temperature 0 \
10 | --conv-mode vicuna_v1
11 |
12 | cd ./playground/data/eval/MME
13 |
14 | python convert_answer_to_mme.py --experiment $NAME
15 |
16 | cd eval_tool
17 |
18 | python calculation.py --results_dir answers/$NAME
19 |
20 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mmmu_val.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'
4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
5 | IFS=',' read -ra GPULIST <<< "$gpu_list"
6 |
7 | CHUNKS=${#GPULIST[@]}
8 |
9 | CKPT="llava-tokenpacker-7b"
10 | CONFIG="llava/eval/mmmu/eval/configs/llava1.5.yaml"
11 |
12 | for IDX in $(seq 0 $((CHUNKS-1))); do
13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python llava/eval/mmmu/eval/run_llava.py \
14 | --data_path /path/to/MMMU \
15 | --config_path $CONFIG \
16 | --model_path llava-tokenpacker-7b \
17 | --answers-file ./playground/data/eval/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \
18 | --num-chunks $CHUNKS \
19 | --chunk-idx $IDX \
20 | --split "validation" \
21 |         --conv-mode vicuna_v1 &  # add --load_8bit True above to load the model in 8-bit
22 | done
23 |
24 | wait
25 |
26 | output_file=./playground/data/eval/MMMU/answers/$CKPT/merge.jsonl
27 |
28 | # Clear out the output file if it exists.
29 | > "$output_file"
30 |
31 | # Loop through the indices and concatenate each file.
32 | for IDX in $(seq 0 $((CHUNKS-1))); do
33 | cat ./playground/data/eval/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
34 | done
35 |
36 | python llava/eval/mmmu/eval/eval.py --result_file $output_file --output_path ./playground/data/eval/MMMU/$CKPT/val.json
37 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mmvet.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m llava.eval.model_vqa \
4 | --model-path llava-tokenpacker-7b \
5 | --question-file /path/to/llava-mm-vet.jsonl \
6 | --image-folder /path/to/mm-vet/images \
7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-tokenpacker-7b.jsonl \
8 | --temperature 0 \
9 | --conv-mode vicuna_v1
10 |
11 | mkdir -p ./playground/data/eval/mm-vet/results
12 |
13 | python scripts/convert_mmvet_for_eval.py \
14 | --src ./playground/data/eval/mm-vet/answers/llava-tokenpacker-7b.jsonl \
15 | --dst ./playground/data/eval/mm-vet/results/llava-tokenpacker-7b.json
16 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/ocr_bench.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m llava.eval.eval_ocr_bench \
3 | --model_path llava-tokenpacker-7b \
4 | --image_folder /path/to/OCR-Bench/OCRBench_Images \
5 | --output_folder ./playground/data/eval/ocr_bench \
6 | --OCRBench_file /path/to/OCRBench.json \
7 | --save_name llava-tokenpacker-7b \
8 | --temperature 0 \
9 | --conv_mode vicuna_v1
10 |
11 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/pope.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | NAME="llava-tokenpacker-7b"
5 |
6 | python -m llava.eval.model_vqa_loader_pope \
7 | --model-path llava-tokenpacker-7b \
8 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \
9 |     --image-folder /path/to/coco_imgs \
10 | --answers-file ./playground/data/eval/pope/answers/$NAME.jsonl \
11 | --temperature 0 \
12 | --conv-mode vicuna_v1
13 |
14 | python llava/eval/eval_pope.py \
15 | --annotation-dir ./playground/data/eval/pope/coco \
16 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \
17 | --result-file ./playground/data/eval/pope/answers/$NAME.jsonl
18 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/textvqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CUDA_VISIBLE_DEVICES=0 python -m llava.eval.model_vqa_loader \
4 | --model-path llava-tokenpacker-7b \
5 |     --question-file /path/to/llava_textvqa_val_v051_ocr.jsonl \
6 |     --image-folder /path/to/textvqa/train_images \
7 | --answers-file ./playground/data/eval/textvqa/answers/llava-tokenpacker-7b.jsonl \
8 | --temperature 0 \
9 | --conv-mode vicuna_v1
10 |
11 | CUDA_VISIBLE_DEVICES=0 python -m llava.eval.eval_textvqa \
12 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \
13 | --result-file ./playground/data/eval/textvqa/answers/llava-tokenpacker-7b.jsonl
14 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/vizwiz.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m llava.eval.model_vqa_loader \
4 |     --model-path llava-tokenpacker-7b \
5 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \
6 | --image-folder /path/to/vizwiz/test \
7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-tokenpacker-7b.jsonl \
8 | --temperature 0 \
9 | --conv-mode vicuna_v1
10 |
11 | python scripts/convert_vizwiz_for_submission.py \
12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \
13 | --result-file ./playground/data/eval/vizwiz/answers/llava-tokenpacker-7b.jsonl \
14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-tokenpacker-7b.json
15 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/vqav2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT="llava-tokenpacker-7b"
9 | SPLIT="llava_vqav2_mscoco_test-dev2015"
10 |
11 | for IDX in $(seq 0 $((CHUNKS-1))); do
12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
13 | --model-path llava-tokenpacker-7b \
14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \
15 | --image-folder /path/to/VQAv2/test2015/ \
16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
17 | --num-chunks $CHUNKS \
18 | --chunk-idx $IDX \
19 | --temperature 0 \
20 | --conv-mode vicuna_v1 &
21 | done
22 |
23 | wait
24 |
25 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl
26 |
27 | # Clear out the output file if it exists.
28 | > "$output_file"
29 |
30 | # Loop through the indices and concatenate each file.
31 | for IDX in $(seq 0 $((CHUNKS-1))); do
32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
33 | done
34 |
35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT
36 |
--------------------------------------------------------------------------------
/scripts/v1_5/finetune.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \
4 | --deepspeed ./scripts/zero2.json \
5 | --model_name_or_path vicuna-7b-v1.5 \
6 | --version v1 \
7 | --data_path /path/to/data/llava_v1_5_mix665k.json \
8 | --image_folder ./data/llava_mix665k \
9 | --vision_tower ./clip-vit-large-patch14-336 \
10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-tokenpacker-pretrain/mm_projector.bin \
11 | --mm_projector_type tokenpacker \
12 | --scale_factor 2 \
13 | --mm_vision_select_layer -2 \
14 | --mm_use_im_start_end False \
15 | --mm_use_im_patch_token False \
16 | --image_aspect_ratio pad \
17 | --group_by_modality_length True \
18 | --bf16 True \
19 | --output_dir ./checkpoints/llava-tokenpacker \
20 | --num_train_epochs 1 \
21 | --per_device_train_batch_size 16 \
22 | --per_device_eval_batch_size 4 \
23 | --gradient_accumulation_steps 1 \
24 | --evaluation_strategy "no" \
25 | --save_strategy "steps" \
26 | --save_steps 50000 \
27 | --save_total_limit 1 \
28 | --learning_rate 2e-5 \
29 | --weight_decay 0. \
30 | --warmup_ratio 0.03 \
31 | --lr_scheduler_type "cosine" \
32 | --logging_steps 1 \
33 | --tf32 True \
34 | --model_max_length 2048 \
35 | --gradient_checkpointing True \
36 | --dataloader_num_workers 4 \
37 | --lazy_preprocess True \
38 | --report_to "none"
39 |
40 |
--------------------------------------------------------------------------------
/scripts/v1_5/finetune_hd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \
4 | --deepspeed ./scripts/zero2.json \
5 | --model_name_or_path vicuna-7b-v1.5 \
6 | --version v1 \
7 | --data_path /path/to/mgm_instruction.json \
8 | --image_folder ./data/MGM-Finetune \
9 | --vision_tower ./clip-vit-large-patch14-336 \
10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-tokenpacker-pretrain-hd/mm_projector.bin \
11 | --mm_projector_type tokenpacker \
12 | --patch_num 9 \
13 | --scale_factor 2 \
14 | --mm_vision_select_layer -2 \
15 | --mm_use_im_start_end False \
16 | --mm_use_im_patch_token False \
17 | --image_aspect_ratio slice \
18 | --group_by_modality_length True \
19 | --bf16 True \
20 | --output_dir ./checkpoints/llava-tokenpacker-hd \
21 | --num_train_epochs 1 \
22 | --per_device_train_batch_size 16 \
23 | --per_device_eval_batch_size 4 \
24 | --gradient_accumulation_steps 1 \
25 | --evaluation_strategy "no" \
26 | --save_strategy "steps" \
27 | --save_steps 50000 \
28 | --save_total_limit 1 \
29 | --learning_rate 2e-5 \
30 | --weight_decay 0. \
31 | --warmup_ratio 0.03 \
32 | --lr_scheduler_type "cosine" \
33 | --logging_steps 1 \
34 | --tf32 True \
35 | --model_max_length 2048 \
36 | --gradient_checkpointing True \
37 | --dataloader_num_workers 4 \
38 | --lazy_preprocess True \
39 | --report_to "none"
40 |
41 |
--------------------------------------------------------------------------------
/scripts/v1_5/pretrain.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \
4 | --deepspeed ./scripts/zero2.json \
5 | --model_name_or_path vicuna-7b-v1.5 \
6 | --version plain \
7 | --data_path /path/to/blip_laion_cc_sbu_558k.json \
8 | --image_folder ./data/llava_pretrain_558k \
9 | --vision_tower ./clip-vit-large-patch14-336 \
10 | --mm_projector_type tokenpacker \
11 | --scale_factor 2 \
12 | --tune_mm_mlp_adapter True \
13 | --mm_vision_select_layer -2 \
14 | --mm_use_im_start_end False \
15 | --mm_use_im_patch_token False \
16 | --bf16 True \
17 | --output_dir ./checkpoints/llava-tokenpacker-pretrain/ \
18 | --num_train_epochs 1 \
19 | --per_device_train_batch_size 32 \
20 | --per_device_eval_batch_size 4 \
21 | --gradient_accumulation_steps 1 \
22 | --evaluation_strategy "no" \
23 | --save_strategy "steps" \
24 | --save_steps 24000 \
25 | --save_total_limit 1 \
26 | --learning_rate 1e-3 \
27 | --weight_decay 0. \
28 | --warmup_ratio 0.03 \
29 | --lr_scheduler_type "cosine" \
30 | --logging_steps 1 \
31 | --tf32 True \
32 | --model_max_length 2048 \
33 | --gradient_checkpointing True \
34 | --dataloader_num_workers 4 \
35 | --lazy_preprocess True \
36 | --report_to "none"
37 |
38 |
--------------------------------------------------------------------------------
/scripts/v1_5/pretrain_hd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \
4 | --deepspeed ./scripts/zero2.json \
5 | --model_name_or_path vicuna-7b-v1.5 \
6 | --version plain \
7 | --data_path /path/to/mgm_pretrain.json \
8 | --image_folder ./data/llava_pretrain_558k \
9 | --vision_tower ./clip-vit-large-patch14-336 \
10 | --mm_projector_type tokenpacker \
11 | --patch_num 9 \
12 | --scale_factor 2 \
13 | --tune_mm_mlp_adapter True \
14 | --mm_vision_select_layer -2 \
15 | --mm_use_im_start_end False \
16 | --mm_use_im_patch_token False \
17 | --image_aspect_ratio slice \
18 | --bf16 True \
19 | --output_dir ./checkpoints/llava-tokenpacker-pretrain-hd/ \
20 | --num_train_epochs 1 \
21 | --per_device_train_batch_size 32 \
22 | --per_device_eval_batch_size 4 \
23 | --gradient_accumulation_steps 1 \
24 | --evaluation_strategy "no" \
25 | --save_strategy "steps" \
26 | --save_steps 24000 \
27 | --save_total_limit 1 \
28 | --learning_rate 1e-3 \
29 | --weight_decay 0. \
30 | --warmup_ratio 0.03 \
31 | --lr_scheduler_type "cosine" \
32 | --logging_steps 1 \
33 | --tf32 True \
34 | --model_max_length 2048 \
35 | --gradient_checkpointing True \
36 | --dataloader_num_workers 4 \
37 | --lazy_preprocess True \
38 | --report_to "none"
39 |
40 |
--------------------------------------------------------------------------------
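
The four v1_5 training scripts above form two two-stage pipelines: pretrain.sh produces the projector that finetune.sh loads through --pretrain_mm_mlp_adapter, and the -hd pair does the same for the high-resolution setup. A sketch of the standard pipeline, assuming the data and model paths inside the scripts have been filled in:

    bash scripts/v1_5/pretrain.sh    # expected to write ./checkpoints/llava-tokenpacker-pretrain/mm_projector.bin
    bash scripts/v1_5/finetune.sh    # consumes that projector, writes ./checkpoints/llava-tokenpacker
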
/scripts/zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "train_micro_batch_size_per_gpu": "auto",
14 | "train_batch_size": "auto",
15 | "gradient_accumulation_steps": "auto",
16 | "zero_optimization": {
17 | "stage": 2,
18 | "overlap_comm": true,
19 | "contiguous_gradients": true,
20 | "sub_group_size": 1e9,
21 | "reduce_bucket_size": "auto"
22 | }
23 | }
--------------------------------------------------------------------------------
/scripts/zero3.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "train_micro_batch_size_per_gpu": "auto",
14 | "train_batch_size": "auto",
15 | "gradient_accumulation_steps": "auto",
16 | "zero_optimization": {
17 | "stage": 3,
18 | "overlap_comm": true,
19 | "contiguous_gradients": true,
20 | "sub_group_size": 1e9,
21 | "reduce_bucket_size": "auto",
22 | "stage3_prefetch_bucket_size": "auto",
23 | "stage3_param_persistence_threshold": "auto",
24 | "stage3_max_live_parameters": 1e9,
25 | "stage3_max_reuse_distance": 1e9,
26 | "stage3_gather_16bit_weights_on_model_save": true
27 | }
28 | }
--------------------------------------------------------------------------------
/scripts/zero3_offload.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 | "scheduler": {
23 | "type": "WarmupLR",
24 | "params": {
25 | "warmup_min_lr": "auto",
26 | "warmup_max_lr": "auto",
27 | "warmup_num_steps": "auto"
28 | }
29 | },
30 | "zero_optimization": {
31 | "stage": 3,
32 | "offload_optimizer": {
33 | "device": "cpu",
34 | "pin_memory": true
35 | },
36 | "offload_param": {
37 | "device": "cpu",
38 | "pin_memory": true
39 | },
40 | "overlap_comm": true,
41 | "contiguous_gradients": true,
42 | "sub_group_size": 1e9,
43 | "reduce_bucket_size": "auto",
44 | "stage3_prefetch_bucket_size": "auto",
45 | "stage3_param_persistence_threshold": "auto",
46 | "stage3_max_live_parameters": 1e9,
47 | "stage3_max_reuse_distance": 1e9,
48 | "gather_16bit_weights_on_model_save": true
49 | },
50 | "gradient_accumulation_steps": "auto",
51 | "gradient_clipping": "auto",
52 | "train_batch_size": "auto",
53 | "train_micro_batch_size_per_gpu": "auto",
54 | "steps_per_print": 1e5,
55 | "wall_clock_breakdown": false
56 | }
--------------------------------------------------------------------------------
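
Every training script above selects one of these three DeepSpeed configs via its --deepspeed flag; zero2.json is the default in the provided scripts, while zero3.json and zero3_offload.json trade throughput for lower GPU memory. A small self-contained check that prints the ZeRO stage each config enables:

    for cfg in scripts/zero2.json scripts/zero3.json scripts/zero3_offload.json; do
        python -c "import json, sys; c = json.load(open(sys.argv[1])); print(sys.argv[1], '-> ZeRO stage', c['zero_optimization']['stage'])" "$cfg"
    done
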