├── README.md ├── assets ├── Algorithm.png ├── compare.png ├── ex1.png ├── ex2.jpg ├── framework.png ├── hd.png ├── high-reso.jpg ├── projector_comparsion.jpg ├── title.png ├── vis-1.jpg └── vis-2.jpg ├── docs └── evaluation.md ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_docvqa.py │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_ocr_bench.py │ ├── eval_pope.py │ ├── eval_textvqa.py │ ├── m4c_evaluator.py │ ├── mmmu │ │ └── eval │ │ │ ├── README.md │ │ │ ├── answer_dict_val.json │ │ │ ├── configs │ │ │ └── llava1.5.yaml │ │ │ ├── convert_to_test.py │ │ │ ├── eval.py │ │ │ ├── main_eval_only.py │ │ │ ├── main_parse_and_eval.py │ │ │ ├── print_results.py │ │ │ ├── run_llava.py │ │ │ └── utils │ │ │ ├── __pycache__ │ │ │ ├── data_utils.cpython-310.pyc │ │ │ ├── eval_utils.cpython-310.pyc │ │ │ └── model_utils_ind.cpython-310.pyc │ │ │ ├── data_utils.py │ │ │ ├── eval_utils.py │ │ │ ├── model_utils.py │ │ │ └── model_utils_ind.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_loader_pope.py │ ├── model_vqa_mmbench.py │ ├── run_llava.py │ └── summarize_gpt_review.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_mpt.py │ │ └── mpt │ │ │ ├── __pycache__ │ │ │ ├── adapt_tokenizer.cpython-310.pyc │ │ │ ├── attention.cpython-310.pyc │ │ │ ├── blocks.cpython-310.pyc │ │ │ ├── configuration_mpt.cpython-310.pyc │ │ │ ├── custom_embedding.cpython-310.pyc │ │ │ ├── flash_attn_triton.cpython-310.pyc │ │ │ ├── hf_prefixlm_converter.cpython-310.pyc │ │ │ ├── meta_init_context.cpython-310.pyc │ │ │ ├── modeling_mpt.cpython-310.pyc │ │ │ ├── norm.cpython-310.pyc │ │ │ └── param_init_fns.cpython-310.pyc │ │ │ ├── adapt_tokenizer.py │ │ │ ├── attention.py │ │ │ ├── blocks.py │ │ │ ├── configuration_mpt.py │ │ │ ├── custom_embedding.py │ │ │ ├── flash_attn_triton.py │ │ │ ├── hf_prefixlm_converter.py │ │ │ ├── meta_init_context.py │ │ │ ├── modeling_mpt.py │ │ │ ├── norm.py │ │ │ └── param_init_fns.py │ ├── llava_arch.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py ├── patch_divide.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ └── train_mem.py └── utils.py ├── pyproject.toml └── scripts ├── convert_docvqa_for_eval.py ├── convert_gqa_for_eval.py ├── convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── extract_mm_projector.py ├── finetune.sh ├── finetune_full_schedule.sh ├── finetune_lora.sh ├── finetune_qlora.sh ├── merge_lora_weights.py ├── pretrain.sh ├── v1_5 ├── eval │ ├── docvqa.sh │ ├── gqa.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu_val.sh │ ├── mmvet.sh │ ├── ocr_bench.sh │ ├── pope.sh │ ├── textvqa.sh │ ├── vizwiz.sh │ └── vqav2.sh ├── finetune.sh ├── finetune_hd.sh ├── pretrain.sh └── pretrain_hd.sh ├── zero2.json ├── zero3.json └── zero3_offload.json /README.md: -------------------------------------------------------------------------------- 1 | 2 |

3 | 4 |

5 | 6 | 7 |
8 | 9 | TokenPacker-v1 10 | 11 | 12 | arXiv 13 | 14 | 15 | HF Model 16 | 17 | 18 | ZhiHu 19 | 20 |
21 | 22 | 23 | --- 24 | 25 | ## Comparisons with existing methods 💡 26 | 27 |

28 | 29 |

## Updates 📌
- [2025/5/23] TokenPacker is accepted by **IJCV** 🎉🎉🎉.
- [2024/10/22] We integrated the TokenPacker-HD framework with [Osprey](https://github.com/CircleRadon/Osprey) to achieve fine-grained, high-resolution pixel-level understanding with large performance gains. Please see the code in this [branch](https://github.com/CircleRadon/TokenPacker/tree/tokenpacker-hd-osprey) for reference.
- [2024/7/25] We released the [checkpoints](https://huggingface.co/collections/sunshine-lwt/tokenpacker-66a234618f0d2327e0cf2cb1), please check them.
- [2024/7/3] We released the [paper](https://arxiv.org/abs/2407.02392) of our TokenPacker on arXiv.
- [2024/7/3] We released the training and inference code.


## What is TokenPacker 👀
TokenPacker is a novel visual projector that adopts a `coarse-to-fine` scheme to inject enriched characteristics into the condensed visual tokens. With TokenPacker, we can compress the visual tokens by **75%∼89%** while achieving comparable or even better performance across diverse benchmarks with significantly higher efficiency.


#### Algorithms
We provide pseudo-code to showcase the detailed processing flow.


#### Core code
As a visual projector, TokenPacker is implemented as the class `TokenPacker`, which can be found in [multimodal_projector/builder.py](./llava/model/multimodal_projector/builder.py#L39).

#### Comparisons with various projectors


## High-Resolution Image Understanding with TokenPacker 🔬
To support efficient `high-resolution` image understanding, we further develop an effective image cropping method, `TokenPacker-HD`.


## Install 🛠️
1. Clone this repository and navigate to the TokenPacker folder
```
git clone https://github.com/CircleRadon/TokenPacker.git
cd TokenPacker
```
2. Install packages
```
conda create -n tokenpacker python=3.10 -y
conda activate tokenpacker
pip install --upgrade pip # enable PEP 660 support
pip install -e .
```
3. Install additional packages for training
```
pip install -e ".[train]"
pip install flash-attn --no-build-isolation
```

## Training 🚀

### LLaVA-TokenPacker

#### Dataset
To make a fair comparison, we use the same training data as in [LLaVA-1.5](https://github.com/haotian-liu/LLaVA), i.e., [LLaVA-Pretrain-558K](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/tree/main) for stage 1 and [Mix665k](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/tree/main) for stage 2.

#### Training
- Stage 1: Image-Text Alignment Pre-training
```shell
bash scripts/v1_5/pretrain.sh
```
- Stage 2: Visual Instruction Tuning
```shell
bash scripts/v1_5/finetune.sh
```
Note: Use `--scale_factor` to control the compression ratio; supported values are [2, 3, 4].

### LLaVA-TokenPacker-HD

#### Dataset
To obtain competitive high-resolution performance, we use the 2.7M samples organized by [Mini-Gemini](https://github.com/dvlab-research/MGM#Dataset), i.e., 1.2M for stage 1 and 1.5M for stage 2.
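#### TokenPacker-HD image cropping (illustrative sketch)
The sketch below condenses the image-cropping step used at inference time in [llava/eval/mmmu/eval/run_llava.py](./llava/eval/mmmu/eval/run_llava.py): the input is resized and zero-padded onto an `h_block x w_block` canvas of 336x336 tiles (the grid is chosen by `Image_Patch.calculate` from `llava/patch_divide.py`), split into local crops, and a single downsampled global view is appended. The helper name `crop_for_hd` is ours for illustration only; treat this as a simplified reference, not a drop-in replacement for the training or evaluation pipelines.

```python
import torch
import torch.nn.functional as F

from llava.patch_divide import Image_Patch  # repo helper that picks the crop grid


def crop_for_hd(image: torch.Tensor, patch_num: int = 9, block_size: int = 336):
    """image: (1, 3, H, W) normalized float tensor; returns (N, 3, 336, 336) crops."""
    image_patch = Image_Patch(patch_num=patch_num)
    h, w = image.shape[-2:]
    # Pick an h_block x w_block grid based on the image size and the --patch_num budget.
    h_block, w_block = image_patch.calculate(h, w)

    # Resize the image so it fits inside the grid canvas, then zero-pad the rest.
    scale = min(block_size * h_block / h, block_size * w_block / w)
    h_ = min(block_size * h_block, round(h * scale))
    w_ = min(block_size * w_block, round(w * scale))
    resized = F.interpolate(image, size=(h_, w_), mode="bilinear")
    canvas = torch.zeros(1, 3, block_size * h_block, block_size * w_block, dtype=resized.dtype)
    canvas[:, :, :h_, :w_] = resized

    # Local crops: one 336x336 tile per grid cell, in row-major order.
    crops = [
        canvas[:, :, i * block_size:(i + 1) * block_size, j * block_size:(j + 1) * block_size]
        for i in range(h_block)
        for j in range(w_block)
    ]

    # Global view: the whole image squeezed into one extra 336x336 tile
    # (only when more than one local crop exists, mirroring run_llava.py).
    if len(crops) > 1:
        g_scale = min(block_size / h, block_size / w)
        g_size = (min(block_size, round(h * g_scale)), min(block_size, round(w * g_scale)))
        g = F.interpolate(image, size=g_size, mode="bilinear")
        global_view = torch.zeros(1, 3, block_size, block_size, dtype=g.dtype)
        global_view[:, :, :g.shape[-2], :g.shape[-1]] = g
        crops.append(global_view)

    return torch.cat(crops, dim=0), h_block, w_block
```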
#### Training
- Stage 1: Image-Text Alignment Pre-training
```shell
bash scripts/v1_5/pretrain_hd.sh
```
- Stage 2: Visual Instruction Tuning
```shell
bash scripts/v1_5/finetune_hd.sh
```

Note:
- Use `--scale_factor` to control the compression ratio; supported values are [2, 3, 4].
- Use `--patch_num` to control the maximum number of image patches; supported values are [9, 16, 25].


## Experiments


## Model Zoo

| Model | Max Res. | Compr. Ratio | Token Num. | Max Patch Num. | Training Data | Download |
|---|:---:|:---:|:---:|:---:|:---:|---|
| TokenPacker-7b | 336x336 | 1/4 | 144 | - | 558K+665K | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-7b-144token/tree/main) |
| TokenPacker-13b | 336x336 | 1/4 | 144 | - | 558K+665K | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-13b-144token/tree/main) |
| TokenPacker-HD-7b | 1088x1088 | 1/4 | ~954 | 9 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-7b-9patch-144token/tree/main) |
| TokenPacker-HD-13b | 1088x1088 | 1/4 | ~954 | 9 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-9patch-144token/tree/main) |
| TokenPacker-HD-13b | 1344x1344 | 1/4 | ~1393 | 16 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-16patch-144token/tree/main) |
| TokenPacker-HD-13b | 1344x1344 | 1/9 | ~619 | 16 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-16patch-64token/tree/main) |
| TokenPacker-HD-13b | 1344x1344 | 1/16 | ~347 | 16 | 1.2M+1.5M | [checkpoints](https://huggingface.co/sunshine-lwt/TokenPacker-HD-13b-16patch-36token/tree/main) |

Note:
- The `token number` of TokenPacker-HD is the `average` computed over all training and test data.
- The `558K+665K` training data follows LLaVA-1.5; the `1.2M+1.5M` data follows Mini-Gemini.
- All models use Vicuna-7B/13B as the base LLM.


## Visualization
We provide some visual examples.

High-resolution image understanding.


## TODO List 📝
- [x] Release the training and inference code.
- [x] Release all checkpoints.


## Acknowledgement 💌
- [LLaVA-v1.5](https://github.com/haotian-liu/LLaVA): the codebase we built upon.
- [Mini-Gemini](https://github.com/dvlab-research/MGM): the organized data we used for training the high-resolution model.

## More ##
For more recent related works, please refer to [Awesome-Token-Compress](https://github.com/daixiangzi/Awesome-Token-Compress).
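## Token counts at a glance
A back-of-the-envelope check on the non-HD token numbers listed in the Model Zoo above, assuming the standard 24x24 = 576 visual tokens produced by the CLIP-ViT-L/14-336 encoder used in LLaVA-1.5 (the HD token numbers are dataset averages and are not reproduced by this arithmetic):

```python
# TokenPacker downsamples each spatial dimension of the 24x24 visual-token grid
# by --scale_factor, so the output count is (24 / scale_factor) ** 2.
for scale_factor in (2, 3, 4):
    tokens = (24 // scale_factor) ** 2
    print(f"scale_factor={scale_factor}: {tokens} tokens (compression ratio 1/{scale_factor ** 2})")
# scale_factor=2: 144 tokens (compression ratio 1/4)
# scale_factor=3: 64 tokens (compression ratio 1/9)
# scale_factor=4: 36 tokens (compression ratio 1/16)
```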
166 | 167 | ## BibTeX 🖊️ 168 | ``` 169 | @misc{TokenPacker, 170 | title={TokenPacker: Efficient Visual Projector for Multimodal LLM}, 171 | author={Wentong Li, Yuqian Yuan, Jian Liu, Dongqi Tang, Song Wang, Jianke Zhu and Lei Zhang}, 172 | year={2024}, 173 | eprint={2407.02392}, 174 | archivePrefix={arXiv}, 175 | primaryClass={cs.CV} 176 | } 177 | ``` 178 | -------------------------------------------------------------------------------- /assets/Algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/Algorithm.png -------------------------------------------------------------------------------- /assets/compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/compare.png -------------------------------------------------------------------------------- /assets/ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/ex1.png -------------------------------------------------------------------------------- /assets/ex2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/ex2.jpg -------------------------------------------------------------------------------- /assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/framework.png -------------------------------------------------------------------------------- /assets/hd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/hd.png -------------------------------------------------------------------------------- /assets/high-reso.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/high-reso.jpg -------------------------------------------------------------------------------- /assets/projector_comparsion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/projector_comparsion.jpg -------------------------------------------------------------------------------- /assets/title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/title.png -------------------------------------------------------------------------------- /assets/vis-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/vis-1.jpg -------------------------------------------------------------------------------- /assets/vis-2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/assets/vis-2.jpg
--------------------------------------------------------------------------------
/docs/evaluation.md:
--------------------------------------------------------------------------------
# Evaluation

## DocVQA
1. Download `test_v1.0.json` to `./playground/data/eval/docvqa/data`.
2. Set `--image-folder` to the path of the [DocVQA](https://rrc.cvc.uab.es/?ch=17&com=downloads) images.
3. Multi-GPU inference.
```
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/docvqa.sh
```
4. Submit the results to the [evaluation server](https://rrc.cvc.uab.es/?ch=17&com=evaluation&task=1): `./playground/data/eval/docvqa/answers/`


## GQA
1. Download the [data](https://cs.stanford.edu/people/dorarad/gqa/download.html) and [evaluation scripts](https://cs.stanford.edu/people/dorarad/gqa/evaluate.html) following the official instructions and put them under `./playground/data/eval/gqa/data`. You may need to modify `eval.py` as [this](https://gist.github.com/haotian-liu/db6eddc2a984b4cbcc8a7f26fd523187) due to the missing assets in the GQA v1.2 release.
2. Multi-GPU inference.
```
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/gqa.sh
```

## MMBench
1. Download [mmbench_dev_20230712.tsv](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put it under `./playground/data/eval/mmbench`.
2. Single-GPU inference.
```
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench.sh
```
3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `./playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712`.


## MME
1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation).
2. Download the images to `MME_Benchmark_release_version`.
3. Put the official `eval_tool` and `MME_Benchmark_release_version` under `./playground/data/eval/MME`.
4. Single-GPU inference and evaluate.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mme.sh
```

## MMMU_val
1. Download the [data](https://huggingface.co/datasets/MMMU/MMMU/tree/main).
2. Set `--data_path` to the path to the MMMU images.
3. Multi-GPU inference.
```
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/mmmu_val.sh
```

## MM-Vet
1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `./playground/data/eval/mmvet`.
2. Single-GPU inference.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh
```
3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official Jupyter notebook.

## OCRBench
1. Download the [data](https://github.com/Yuliang-Liu/MultimodalOCR).
2. Set `--image_folder` to the path to the OCRBench images, and set `--OCRBench_file` to the OCRBench json file.
3. Single-GPU inference.
```Shell
CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/ocr_bench.sh
```

## POPE
1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put it under `./playground/data/eval/pope`.
2. Single-GPU inference and evaluate.
66 | ```Shell 67 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/pope.sh 68 | ``` 69 | 70 | ### TextVQA 71 | 1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and extract to `./playground/data/eval/textvqa`. 72 | 2. Download[images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and set `--image-folder` to the path to textvqa images. 73 | 2. Single-GPU inference and evaluate. 74 | ```Shell 75 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/textvqa.sh 76 | ``` 77 | 78 | ## Vizwiz 79 | 1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `./playground/data/eval/vizwiz`. 80 | 2. Single-GPU inference. 81 | ```Shell 82 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/vizwiz.sh 83 | ``` 84 | 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/2185/my-submission): `./playground/data/eval/vizwiz/answers_upload`. 85 | 86 | 87 | ## VQAv2 88 | 1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and set `--image-folder` to the path to `test2015`. 89 | 2. Multi-GPU inference. 90 | ```Shell 91 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/vqav2.sh 92 | ``` 93 | 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `./playground/data/eval/vqav2/answers_upload`. 94 | -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | 86 | if isinstance(inst['caption'], list): 87 | cap_str = '\n'.join(inst['caption']) 88 | else: 89 | cap_str = inst['caption'] 90 | 91 | category = 'llava_bench_' + json.loads(ques_js)['category'] 92 | if category in rule_dict: 93 | rule = rule_dict[category] 94 | else: 95 | assert False, f"Visual QA category not found in rule file: {category}." 
96 | prompt = rule['prompt'] 97 | role = rule['role'] 98 | content = (f'[Context]\n{cap_str}\n\n' 99 | f'[Question]\n{ques["text"]}\n\n' 100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 102 | f'[System]\n{prompt}\n\n') 103 | cur_js = { 104 | 'id': idx+1, 105 | 'question_id': ques['question_id'], 106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 108 | 'category': category 109 | } 110 | if idx >= len(cur_reviews): 111 | review = get_eval(content, args.max_tokens) 112 | scores = parse_score(review) 113 | cur_js['content'] = review 114 | cur_js['tuple'] = scores 115 | review_file.write(json.dumps(cur_js) + '\n') 116 | review_file.flush() 117 | else: 118 | print(f'Skipping {idx} as we already have it.') 119 | idx += 1 120 | print(idx) 121 | review_file.close() 122 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, 
ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /llava/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | 
parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /llava/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation Guidelines 2 | We provide detailed instructions for evaluation. 
3 | To execute our evaluation script, please ensure that the structure of your model outputs is the same as ours. 4 | 5 | We provide two options: 6 | 1. Evaluation only: you can parse the response on your own and simply provide one file with all the final predictions. 7 | 2. Parse and evaluation: you can leave all the responses to us with the output formats shown below. 8 | 9 | ## Evaluation Only 10 | If you want to use your own parsing logic and *only provide the final answer*, you can use `main_eval_only.py`. 11 | 12 | You can provide all the outputs in *one file* in the following format: 13 | 14 | ``` 15 | { 16 | "validation_Accounting_1": "D", # strictly "A", "B", "C", "D" for multi-choice question 17 | "validation_Architecture_and_Engineering_14": "0.0", # any string response for open question. 18 | ... 19 | } 20 | ``` 21 | Then run eval_only with: 22 | ``` 23 | python main_eval_only.py --output_path ./example_outputs/llava1.5_13b/total_val_output.json 24 | ``` 25 | 26 | Please refer to [example output](https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/example_outputs/llava1.5_13b/total_val_output.json) for a detailed prediction file form. 27 | 28 | 29 | ## Parse and Evaluation 30 | You can also provide response and run the `main_parse_and_eval.py` to use our answer parsing processing and evaluation pipeline as follows: 31 | 32 | ### Output folder structure 33 | 34 | ``` 35 | └── model_name 36 | ├── category_name (e.g., Accounting) 37 | │ ├── output.json 38 | └── category_name (e.g., Electronics) 39 | ├── output.json 40 | ... 41 | ``` 42 | 43 | ### Output file 44 | Each `output.json`` has a list of dict containing instances for evaluation (). 45 | ``` 46 | [ 47 | { 48 | "id": "validation_Electronics_28", 49 | "question_type": "multiple-choice", 50 | "answer": "A", # given answer 51 | "all_choices": [ # create using `get_multi_choice_info` in 52 | "A", 53 | "B", 54 | "C", 55 | "D" 56 | ], 57 | "index2ans": { # create using `get_multi_choice_info` in 58 | "A": "75 + 13.3 cos(250t - 57.7°)V", 59 | "B": "75 + 23.3 cos(250t - 57.7°)V", 60 | "C": "45 + 3.3 cos(250t - 57.7°)V", 61 | "D": "95 + 13.3 cos(250t - 57.7°)V" 62 | }, 63 | "response": "B" # model response 64 | }, 65 | { 66 | "id": "validation_Electronics_29", 67 | "question_type": "short-answer", 68 | "answer": "30", # given answer 69 | "response": "36 watts" # model response 70 | }, 71 | ... 72 | ] 73 | ``` 74 | 75 | ### Evaluation 76 | ``` 77 | python main_parse_and_eval.py --path ./example_outputs/llava1.5_13b --subject ALL # all subject 78 | 79 | # OR you can sepecify one subject for the evaluation 80 | 81 | python main_parse_and_eval.py --path ./example_outputs/llava1.5_13b --subject elec # short name for Electronics. use --help for all short names 82 | 83 | ``` 84 | 85 | `main_parse_and_eval.py` will generate `parsed_output.json` and `result.json` in the subfolder under the same category with output.json, respectively. 86 | 87 | ``` 88 | ├── Accounting 89 | │ ├── output.json 90 | │ ├── parsed_output.json 91 | │ └── result.json 92 | └── Electronics 93 | ├── output.json 94 | ├── parsed_output.json 95 | └── result.json 96 | ... 97 | ``` 98 | 99 | ### Print Results 100 | You can print results locally if you want. 
(use `pip install tabulate` if you haven't)
```
python print_results.py --path ./example_outputs/llava1.5_13b
# Results may differ slightly due to the random selection used for failed responses
```


##### Run LLaVA
If you want to reproduce the results of some of the models, please check run_llava.py as an example.

By setting up the environment following the [LLaVA official repo](https://github.com/haotian-liu/LLaVA) and installing the Hugging Face `datasets` package, you can run LLaVA via the following command:

```
CUDA_VISIBLE_DEVICES=0 nohup python run_llava.py \
 --output_path example_outputs/llava1.5_13b_val.json \
 --model_path liuhaotian/llava-v1.5-13b \
 --config_path configs/llava1.5.yaml
```

Then you can evaluate the results via the very first pipeline.
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/configs/llava1.5.yaml:
--------------------------------------------------------------------------------
task_instructions:
- ""
multi_choice_example_format:
- "{}

{}

Answer with the option's letter from the given choices directly."

short_ans_example_format:
- "{}

Answer the question using a single word or phrase."
temperature:
- 0
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/convert_to_test.py:
--------------------------------------------------------------------------------
import os
import json
from argparse import ArgumentParser

from utils.eval_utils import evaluate
from utils.data_utils import save_json


def main():
    parser = ArgumentParser()
    parser.add_argument('--result_file', type=str, default='llava1.5_13b_val.txt',
                        help='name of saved json')
    parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json',
                        help='name of saved json')

    args = parser.parse_args()
    out_samples = [json.loads(line) for line in open(args.result_file)]
    out_json = {}
    for _sample in out_samples:
        _result = _sample['parsed_pred']
        if isinstance(_result, list):
            _result = str(_result[0])
        out_json[_sample['id']] = _result

    save_json(args.output_path, out_json)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/llava/eval/mmmu/eval/eval.py:
--------------------------------------------------------------------------------
import os
import json
from argparse import ArgumentParser

from utils.eval_utils import evaluate
from utils.data_utils import save_json


def main():
    parser = ArgumentParser()
    parser.add_argument('--result_file', type=str, default='llava1.5_13b_val.txt',
                        help='name of saved json')
    parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json',
                        help='name of saved json')

    args = parser.parse_args()
    out_samples = [json.loads(line) for line in open(args.result_file)]

    judge_dict, metric_dict = evaluate(out_samples)
    metric_dict.update({"num_example": len(out_samples)})
    judge_dict['metric_dict'] = metric_dict
    save_dir = '/'.join(args.output_path.split('/')[:-1])
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_json(args.output_path, judge_dict)
print(metric_dict) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/main_eval_only.py: -------------------------------------------------------------------------------- 1 | """Parse and Evalate""" 2 | import os 3 | import json 4 | 5 | import pdb 6 | from argparse import ArgumentParser 7 | 8 | from utils.data_utils import save_json, CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT 9 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response, calculate_ins_level_acc 10 | 11 | 12 | if __name__ == '__main__': 13 | 14 | parser = ArgumentParser() 15 | parser.add_argument('--output_path', type=str, default="./example_outputs/qwen_vl/total_val_output.json", help="The path to model output file.") 16 | parser.add_argument('--answer_path', type=str, default="./answer_dict_val.json", help="Answer file path.") 17 | args = parser.parse_args() 18 | 19 | output_dict = json.load(open(args.output_path)) 20 | answer_dict = json.load(open(args.answer_path)) 21 | 22 | # group by category 23 | output_dict_w_cat = {} 24 | for data_id, parsed_pred in output_dict.items(): 25 | category = "_".join(data_id.split("_")[1:-1]) 26 | if category not in output_dict_w_cat: 27 | output_dict_w_cat.update({category: {}}) 28 | output_dict_w_cat[category].update({data_id: parsed_pred}) 29 | 30 | # group by category 31 | answer_dict_w_cat = {} 32 | for data_id, parsed_pred in answer_dict.items(): 33 | category = "_".join(data_id.split("_")[1:-1]) 34 | if category not in answer_dict_w_cat: 35 | answer_dict_w_cat.update({category: {}}) 36 | answer_dict_w_cat[category].update({data_id: parsed_pred}) 37 | 38 | evaluation_result = {} 39 | 40 | for category in CAT_SHORT2LONG.values(): 41 | print("Evaluating: {}".format(category)) 42 | # get cat_outputs and cat_answers 43 | try: 44 | cat_outputs = output_dict_w_cat[category] 45 | cat_answers = answer_dict_w_cat[category] 46 | except KeyError: 47 | print("Skipping {} for not found".format(category)) 48 | continue 49 | 50 | exampels_to_eval = [] 51 | for data_id, parsed_pred in cat_outputs.items(): 52 | question_type = cat_answers[data_id]['question_type'] 53 | if question_type != 'multiple-choice': 54 | parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.) 
55 | else: 56 | parsed_pred = parsed_pred 57 | 58 | exampels_to_eval.append({ 59 | "id": data_id, 60 | "question_type": question_type, 61 | "answer": cat_answers[data_id]['ground_truth'], 62 | "parsed_pred": parsed_pred 63 | }) 64 | 65 | judge_dict, metric_dict = evaluate(exampels_to_eval) 66 | metric_dict.update({"num_example": len(exampels_to_eval)}) 67 | 68 | evaluation_result[category] = metric_dict 69 | 70 | printable_results = {} 71 | # pdb.set_trace() 72 | # add domain Subject 73 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): 74 | in_domain_cat_results = {} 75 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT 76 | if cat_name in evaluation_result.keys(): 77 | in_domain_cat_results[cat_name] = evaluation_result[cat_name] 78 | else: 79 | pass 80 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) 81 | in_domain_data_num = sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()]) 82 | printable_results['Overall-' + domain] = {"num": int(in_domain_data_num), 83 | "acc": round(in_domain_ins_acc, 3) 84 | } 85 | # add sub category 86 | for cat_name, cat_results in in_domain_cat_results.items(): 87 | printable_results[cat_name] = {"num": int(cat_results['num_example']), 88 | "acc": round(cat_results['acc'], 3) 89 | } 90 | 91 | # table.append(["-----------------------------", "-----", "----"]) 92 | all_ins_acc = calculate_ins_level_acc(evaluation_result) 93 | printable_results['Overall'] = {"num": sum([cat_results['num_example'] for cat_results in evaluation_result.values()]), 94 | "acc": round(all_ins_acc, 3) 95 | } 96 | 97 | print(printable_results) 98 | 99 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/main_parse_and_eval.py: -------------------------------------------------------------------------------- 1 | """Parse and Evalate""" 2 | import os 3 | import json 4 | from argparse import ArgumentParser 5 | 6 | from utils.data_utils import save_json, CAT_SHORT2LONG 7 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response 8 | 9 | 10 | if __name__ == '__main__': 11 | 12 | parser = ArgumentParser() 13 | parser.add_argument('--path', type=str, default="./example_outputs/llava1.5_13b", help="The path to model output directory.") 14 | parser.add_argument('--subject', nargs='+', 15 | help=f'The name of the mmmu sub-category. 
Availble: {CAT_SHORT2LONG.keys()} or ALL') 16 | 17 | args = parser.parse_args() 18 | if args.subject[0] == 'ALL': 19 | args.subject = CAT_SHORT2LONG.keys() 20 | 21 | ex_output_path = os.path.join(args.path) 22 | 23 | all_results = {} 24 | for cat_short in args.subject: 25 | category = CAT_SHORT2LONG[cat_short] 26 | print("Evaluating: {}".format(category)) 27 | if category not in os.listdir(ex_output_path): 28 | print("Skipping {} for not found".format(category)) 29 | else: 30 | cat_folder_path = os.path.join(ex_output_path, category) 31 | cat_outputs = json.load(open(os.path.join(cat_folder_path, 'output.json'))) 32 | # Evaluation 33 | eval_samples = [] 34 | for cat_output in cat_outputs: 35 | response = cat_output['response'] 36 | if cat_output['question_type'] == 'multiple-choice': 37 | all_choices = cat_output['all_choices'] 38 | index2ans = cat_output['index2ans'] 39 | parsed_pred = parse_multi_choice_response(response, all_choices, index2ans) 40 | eval_samples.append( 41 | { 42 | 'id': cat_output['id'], 43 | 'question_type': cat_output['question_type'], 44 | 'answer': cat_output['answer'], # the content in option, not answer index. 45 | 'response': response, 46 | 'parsed_pred': parsed_pred, 47 | 'index2ans': index2ans, 48 | } 49 | ) 50 | else: # open 51 | parsed_pred = parse_open_response(response) 52 | eval_samples.append( 53 | { 54 | 'id': cat_output['id'], 55 | 'question_type': cat_output['question_type'], 56 | 'answer': cat_output['answer'], 57 | 'response': response, 58 | 'parsed_pred': parsed_pred, 59 | } 60 | ) 61 | 62 | print("Num of valid samples: {}, Expected Num: {}".format(len(eval_samples), len(cat_outputs))) 63 | 64 | judge_dict, metric_dict = evaluate(eval_samples) 65 | metric_dict.update({"num_example": len(eval_samples)}) 66 | for eval_sample in eval_samples: 67 | eval_sample.update({"judge": judge_dict[eval_sample['id']]}) 68 | 69 | save_json(os.path.join(cat_folder_path, 'parsed_output.json'), eval_samples) 70 | save_json(os.path.join(cat_folder_path, 'result.json'), metric_dict) 71 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/print_results.py: -------------------------------------------------------------------------------- 1 | # Beautiful table to print results of all categories 2 | 3 | import os 4 | from typing import Dict 5 | import json 6 | import numpy as np 7 | from tabulate import tabulate 8 | 9 | from argparse import ArgumentParser 10 | 11 | from utils.data_utils import CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT 12 | 13 | from utils.eval_utils import calculate_ins_level_acc 14 | 15 | def main(): 16 | parser = ArgumentParser() 17 | parser.add_argument('--path', type=str, default="./example_outputs/blip2_flant5xxl", help="The path to output directory.") 18 | args = parser.parse_args() 19 | 20 | # load all results 21 | all_results = {} 22 | for cat_folder_name in os.listdir(args.path): 23 | if cat_folder_name in CAT_SHORT2LONG.values(): 24 | cat_folder_path = os.path.join(args.path, cat_folder_name) 25 | result_path = os.path.join(cat_folder_path, 'result.json') 26 | if os.path.exists(result_path): 27 | cat_results = json.load(open(result_path)) 28 | all_results[cat_folder_name] = cat_results 29 | 30 | # print results 31 | headers = ['Subject', 'Data Num', 'Acc'] 32 | table = [] 33 | 34 | # add domain Subject 35 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): 36 | in_domain_cat_results = {} 37 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT 38 | if cat_name in 
all_results.keys(): 39 | in_domain_cat_results[cat_name] = all_results[cat_name] 40 | else: 41 | pass 42 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) 43 | in_domain_data_num = np.sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()]) 44 | table.append(['Overall-' + domain, int(in_domain_data_num), round(in_domain_ins_acc, 3)]) 45 | # add sub category 46 | for cat_name, cat_results in in_domain_cat_results.items(): 47 | table.append([cat_name, int(cat_results['num_example']), round(cat_results['acc'], 3)]) 48 | # table.append(["-----------------------------", "-----", "----"]) 49 | 50 | # table.append(["-----------------------------", "-----", "----"]) 51 | all_ins_acc = calculate_ins_level_acc(all_results) 52 | table.append(['Overall', np.sum([cat_results['num_example'] for cat_results in all_results.values()]), round(all_ins_acc, 3)]) 53 | 54 | print(tabulate(table, headers=headers, tablefmt='orgtbl')) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/run_llava.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import math 7 | from tqdm import tqdm 8 | import json 9 | 10 | from datasets import load_dataset, concatenate_datasets 11 | from argparse import ArgumentParser 12 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 13 | from llava.model import * 14 | from llava.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path 15 | from utils.data_utils import load_yaml, construct_prompt, save_json, process_single_sample, CAT_SHORT2LONG 16 | from utils.model_utils_ind import call_llava_engine_df 17 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response 18 | import torch.nn.functional as F 19 | from functools import partial 20 | from llava.patch_divide import Image_Patch 21 | from torchvision.transforms import Compose, ToTensor, Normalize 22 | 23 | def set_seed(seed_value): 24 | """ 25 | Set the seed for PyTorch (both CPU and CUDA), Python, and NumPy for reproducible results. 26 | 27 | :param seed_value: An integer value to be used as the seed. 28 | """ 29 | torch.manual_seed(seed_value) 30 | if torch.cuda.is_available(): 31 | torch.cuda.manual_seed(seed_value) 32 | torch.cuda.manual_seed_all(seed_value) # For multi-GPU setups 33 | random.seed(seed_value) 34 | np.random.seed(seed_value) 35 | torch.backends.cudnn.deterministic = True 36 | torch.backends.cudnn.benchmark = False 37 | 38 | def split_list(lst, n): 39 | """Split a list into n (roughly) equal-sized chunks""" 40 | chunk_size = math.ceil(len(lst) / n) # integer division 41 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 42 | 43 | 44 | def get_chunk(lst, n, k): 45 | chunks = split_list(lst, n) 46 | return chunks[k] 47 | 48 | 49 | def main(): 50 | parser = ArgumentParser() 51 | # parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json', 52 | # help='name of saved json') 53 | parser.add_argument('--config_path', type=str, default="configs/llava1.5.yaml") 54 | parser.add_argument('--data_path', type=str, default="MMMU/MMMU") # hf dataset path. 
55 | parser.add_argument('--model_path', type=str, default="liuhaotian/llava-v1.5-13b") 56 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 57 | parser.add_argument("--num-chunks", type=int, default=1) 58 | parser.add_argument("--chunk-idx", type=int, default=0) 59 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 60 | parser.add_argument('--split', type=str, default='validation') 61 | parser.add_argument('--seed', type=int, default=42) 62 | parser.add_argument('--load_8bit', type=bool, default=False) 63 | 64 | args = parser.parse_args() 65 | # device = torch.device("cuda") if torch.cuda.is_available() else "cpu" 66 | set_seed(args.seed) 67 | 68 | print('llava_initializing...') 69 | processor = None 70 | call_model_engine = call_llava_engine_df 71 | 72 | # load config and process to one value 73 | args.config = load_yaml(args.config_path) 74 | for key, value in args.config.items(): 75 | if key != 'eval_params' and type(value) == list: 76 | assert len(value) == 1, 'key {} has more than one value'.format(key) 77 | args.config[key] = value[0] 78 | 79 | model_path = os.path.expanduser(args.model_path) 80 | model_name = get_model_name_from_path(model_path) 81 | tokenizer = AutoTokenizer.from_pretrained( 82 | args.model_path, 83 | model_max_length = 2048, 84 | padding_side="right", 85 | use_fast = True 86 | ) 87 | model = LlavaLlamaForCausalLM.from_pretrained( 88 | args.model_path, 89 | torch_dtype=torch.bfloat16, 90 | ).cuda() 91 | 92 | for m in model.modules(): 93 | m.tokenizer = tokenizer 94 | 95 | vision_tower = model.get_vision_tower() 96 | if not vision_tower.is_loaded: 97 | vision_tower.load_model() 98 | vision_tower.to(device='cuda', dtype=torch.float16) 99 | image_processor = vision_tower.image_processor 100 | 101 | patch_num = getattr(model.config, 'patch_num', '9') 102 | image_patch = Image_Patch(patch_num=int(patch_num)) 103 | preprocess = Compose([ToTensor(), Normalize((0.48145466, 0.4578275, 0.40821073),(0.26862954, 0.26130258, 0.27577711))]) 104 | 105 | 106 | # run for each subject 107 | sub_dataset_list = [] 108 | subjects = [x for x in CAT_SHORT2LONG.values()] 109 | ''' 110 | subjects = [ 111 | 'Architecture_and_Engineering', 'Computer_Science', 'Electronics', 112 | 'Energy_and_Power', 'Materials', 'Mechanical_Engineering' 113 | ] 114 | ''' 115 | for subject in tqdm(subjects): 116 | sub_dataset = load_dataset(args.data_path, subject, split=args.split) 117 | sub_dataset_list.append(sub_dataset) 118 | 119 | sub_dataset_list = get_chunk(sub_dataset_list, args.num_chunks, args.chunk_idx) 120 | 121 | # merge all dataset 122 | dataset = concatenate_datasets(sub_dataset_list) 123 | 124 | # samples = [] 125 | out_samples = [] 126 | for sample in tqdm(dataset): 127 | sample = process_single_sample(sample) 128 | 129 | sample = construct_prompt(sample, args.config) 130 | if sample['image']: 131 | image = sample['image'].convert('RGB') 132 | if model.config.image_aspect_ratio == 'slice': 133 | image = preprocess(image) 134 | image = image.unsqueeze(0) 135 | h, w = image.shape[-2:] 136 | block_size = 336 137 | h_block, w_block = image_patch.calculate(h, w) 138 | h_ratio = block_size*h_block/h 139 | w_ratio = block_size*w_block/w 140 | if h_ratio<=w_ratio: 141 | w_ = min(block_size*w_block, round(w*h_ratio)) 142 | h_ = block_size*h_block 143 | else: 144 | w_ = block_size*w_block 145 | h_ = min(block_size*h_block, round(h*w_ratio)) 146 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 147 | image = torch.zeros((1, 3, 
block_size*h_block, block_size*w_block)).to(dtype=image_inter.dtype, device=image_inter.device) 148 | image[:, :, :h_, :w_] = image_inter 149 | 150 | split_images = [] 151 | for i_ in range(h_block): 152 | for j_ in range(w_block): 153 | image_s = image[:,:,block_size*i_:block_size*(i_+1), block_size*j_:block_size*(j_+1)] 154 | split_images.append(image_s) 155 | if len(split_images)>1: 156 | h_ratio = block_size/h 157 | w_ratio = block_size/w 158 | if h_ratio<=w_ratio: 159 | w_ = min(block_size, round(w*h_ratio)) 160 | h_ = block_size 161 | else: 162 | w_ = block_size 163 | h_ = min(block_size, round(h*w_ratio)) 164 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 165 | image_s = torch.zeros((1, 3, block_size, block_size)).to(dtype=image_inter.dtype, device=image_inter.device) 166 | image_s[:, :, :h_, :w_] = image_inter 167 | split_images.append(image_s) 168 | image_tensor = torch.cat(split_images, dim=0) 169 | else: 170 | image_tensor = process_images([image], image_processor, model.config)[0] 171 | image_tensor = image_tensor.unsqueeze(0) 172 | h_block = 1 173 | w_block = 1 174 | 175 | sample['image'] = image_tensor 176 | 177 | # samples.append(sample) 178 | mode = model.config.image_aspect_ratio 179 | with torch.no_grad(): 180 | response = call_model_engine(args, sample, model, tokenizer, processor, h_block, w_block, mode) 181 | if sample['question_type'] == 'multiple-choice': 182 | parsed_pred = parse_multi_choice_response(response, sample['all_choices'], sample['index2ans']) 183 | out_sample = { 184 | 'id': sample['id'], 185 | 'question_type': sample['question_type'], 186 | 'answer': sample['answer'], 187 | 'response': response, 188 | 'parsed_pred': parsed_pred, 189 | 'index2ans': sample['index2ans'], 190 | } 191 | else: # open question 192 | parsed_pred = parse_open_response(response) 193 | out_sample = { 194 | 'id': sample['id'], 195 | 'question_type': sample['question_type'], 196 | 'answer': sample['answer'], 197 | 'response': response, 198 | 'parsed_pred': parsed_pred, 199 | } 200 | out_samples.append(out_sample) 201 | 202 | answers_file = os.path.expanduser(args.answers_file) 203 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 204 | ans_file = open(answers_file, "w") 205 | for i, sample in enumerate(out_samples): 206 | ans_file.write(json.dumps(sample) + "\n") 207 | ans_file.close() 208 | 209 | if __name__ == '__main__': 210 | main() 211 | 212 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/__pycache__/data_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/eval/mmmu/eval/utils/__pycache__/data_utils.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/__pycache__/eval_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/eval/mmmu/eval/utils/__pycache__/eval_utils.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/__pycache__/model_utils_ind.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/eval/mmmu/eval/utils/__pycache__/model_utils_ind.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/data_utils.py: -------------------------------------------------------------------------------- 1 | """Utils for data load, save, and process (e.g., prompt construction)""" 2 | 3 | import os 4 | import json 5 | import yaml 6 | import re 7 | 8 | 9 | DOMAIN_CAT2SUB_CAT = { 10 | 'Art and Design': ['Art', 'Art_Theory', 'Design', 'Music'], 11 | 'Business': ['Accounting', 'Economics', 'Finance', 'Manage','Marketing'], 12 | 'Science': ['Biology', 'Chemistry', 'Geography', 'Math', 'Physics',], 13 | 'Health and Medicine': ['Basic_Medical_Science', 'Clinical_Medicine', 'Diagnostics_and_Laboratory_Medicine', 'Pharmacy', 'Public_Health'], 14 | 'Humanities and Social Science': ['History', 'Literature', 'Sociology', 'Psychology'], 15 | 'Tech and Engineering': ['Agriculture', 'Architecture_and_Engineering', 'Computer_Science', 'Electronics', 'Energy_and_Power', 'Materials', 'Mechanical_Engineering'], 16 | } 17 | 18 | 19 | CAT_SHORT2LONG = { 20 | 'acc': 'Accounting', 21 | 'agri': 'Agriculture', 22 | 'arch': 'Architecture_and_Engineering', 23 | 'art': 'Art', 24 | 'art_theory': 'Art_Theory', 25 | 'bas_med': 'Basic_Medical_Science', 26 | 'bio': 'Biology', 27 | 'chem': 'Chemistry', 28 | 'cli_med': 'Clinical_Medicine', 29 | 'cs': 'Computer_Science', 30 | 'design': 'Design', 31 | 'diag_med': 'Diagnostics_and_Laboratory_Medicine', 32 | 'econ': 'Economics', 33 | 'elec': 'Electronics', 34 | 'ep': 'Energy_and_Power', 35 | 'fin': 'Finance', 36 | 'geo': 'Geography', 37 | 'his': 'History', 38 | 'liter': 'Literature', 39 | 'manage': 'Manage', 40 | 'mark': 'Marketing', 41 | 'mate': 'Materials', 42 | 'math': 'Math', 43 | 'mech': 'Mechanical_Engineering', 44 | 'music': 'Music', 45 | 'phar': 'Pharmacy', 46 | 'phys': 'Physics', 47 | 'psy': 'Psychology', 48 | 'pub_health': 'Public_Health', 49 | 'socio': 'Sociology' 50 | } 51 | 52 | # DATA SAVING 53 | def save_json(filename, ds): 54 | with open(filename, 'w') as f: 55 | json.dump(ds, f, indent=4) 56 | 57 | 58 | def get_multi_choice_info(options): 59 | """ 60 | Given the list of options for multiple choice question 61 | Return the index2ans and all_choices 62 | """ 63 | 64 | start_chr = 'A' 65 | all_choices = [] 66 | index2ans = {} 67 | for i, option in enumerate(options): 68 | index2ans[chr(ord(start_chr) + i)] = option 69 | all_choices.append(chr(ord(start_chr) + i)) 70 | 71 | return index2ans, all_choices 72 | 73 | def load_yaml(file_path): 74 | with open(file_path, 'r') as stream: 75 | try: 76 | yaml_dict = yaml.safe_load(stream) 77 | except yaml.YAMLError as exc: 78 | print(exc) 79 | 80 | return yaml_dict 81 | 82 | 83 | def parse_img_path(text): 84 | matches = re.findall("", text) 85 | return matches 86 | 87 | def process_single_sample(data): 88 | question = data['question'] 89 | o_imgs_paths = [] 90 | for option in data['options']: 91 | current_o_imgs_paths = parse_img_path(option) 92 | for img_path in current_o_imgs_paths: 93 | o_imgs_paths.append(img_path) 94 | 95 | if len(o_imgs_paths) > 1: # multiple images in options, used for random selection 96 | return {'id': data['id'], 'question': question, 'options': data['options'], 'answer': data['answer'], 97 | 'image': None, 'question_type': data['question_type']} 98 | else: 99 | return {'id': data['id'], 'question': question, 'options': 
data['options'], 'answer': data['answer'], 100 | 'image': data['image_1'], 'question_type': data['question_type']} 101 | 102 | 103 | # DATA SAVING 104 | def save_json(filename, ds): 105 | with open(filename, 'w') as f: 106 | json.dump(ds, f, indent=4) 107 | 108 | def save_jsonl(filename, data): 109 | """ 110 | Save a dictionary of data to a JSON Lines file with the filename as key and caption as value. 111 | 112 | Args: 113 | filename (str): The path to the file where the data should be saved. 114 | data (dict): The dictionary containing the data to save where key is the image path and value is the caption. 115 | """ 116 | with open(filename, 'w', encoding='utf-8') as f: 117 | for img_path, caption in data.items(): 118 | # Extract the base filename without the extension 119 | base_filename = os.path.basename(img_path) 120 | # Create a JSON object with the filename as the key and caption as the value 121 | json_record = json.dumps({base_filename: caption}, ensure_ascii=False) 122 | # Write the JSON object to the file, one per line 123 | f.write(json_record + '\n') 124 | 125 | def save_args(args, path_dir): 126 | argsDict = args.__dict__ 127 | with open(path_dir + 'setting.txt', 'w') as f: 128 | f.writelines('------------------ start ------------------' + '\n') 129 | for eachArg, value in argsDict.items(): 130 | f.writelines(eachArg + ' : ' + str(value) + '\n') 131 | f.writelines('------------------- end -------------------') 132 | 133 | 134 | 135 | # DATA PROCESSING 136 | def construct_prompt(sample, config): 137 | question = sample['question'] 138 | options = eval(sample['options']) 139 | example = "" 140 | if sample['question_type'] == 'multiple-choice': 141 | start_chr = 'A' 142 | prediction_range = [] 143 | index2ans = {} 144 | for option in options: 145 | prediction_range.append(start_chr) 146 | example += f"({start_chr}) {option}\n" 147 | index2ans[start_chr] = option 148 | start_chr = chr(ord(start_chr) + 1) 149 | empty_prompt_sample_structure = config['multi_choice_example_format'] 150 | empty_prompt = empty_prompt_sample_structure.format(question, example) 151 | res_dict = {} 152 | res_dict['index2ans'] = index2ans 153 | res_dict['correct_choice'] = sample['answer'] 154 | res_dict['all_choices'] = prediction_range 155 | res_dict['empty_prompt'] = empty_prompt 156 | if config['task_instructions']: 157 | res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt 158 | else: 159 | res_dict['final_input_prompt'] = empty_prompt 160 | 161 | res_dict['gt_content'] = options[ord(sample['answer'].upper()) - ord('A')] 162 | else: 163 | empty_prompt_sample_structure = config['short_ans_example_format'] 164 | empty_prompt = empty_prompt_sample_structure.format(question) 165 | res_dict = {} 166 | res_dict['empty_prompt'] = empty_prompt 167 | if config['task_instructions']: 168 | res_dict['final_input_prompt'] = config['task_instructions'].strip() + '\n\n' + empty_prompt 169 | else: 170 | res_dict['final_input_prompt'] = empty_prompt 171 | res_dict['gt_content'] = sample['answer'] 172 | 173 | res_dict.update(sample) 174 | return res_dict -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | import torch 3 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 4 | from llava.conversation import conv_templates, 
SeparatorStyle 5 | 6 | def call_llava_engine_df(args, sample, model, tokenizer=None, processor=None): 7 | 8 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 9 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 10 | 11 | def insert_separator(X, sep): 12 | return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] 13 | 14 | input_ids = [] 15 | offset = 0 16 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 17 | offset = 1 18 | input_ids.append(prompt_chunks[0][0]) 19 | 20 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 21 | input_ids.extend(x[offset:]) 22 | 23 | if return_tensors is not None: 24 | if return_tensors == 'pt': 25 | return torch.tensor(input_ids, dtype=torch.long) 26 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 27 | return input_ids 28 | 29 | def deal_with_prompt(input_text, mm_use_im_start_end): 30 | qs = input_text 31 | if mm_use_im_start_end: 32 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 33 | else: 34 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 35 | return qs 36 | 37 | prompt = sample['final_input_prompt'] 38 | prompt = deal_with_prompt(prompt, model.config.mm_use_im_start_end) 39 | conv = conv_templates['vicuna_v1'].copy() 40 | conv.append_message(conv.roles[0], prompt) 41 | conv.append_message(conv.roles[1], None) 42 | prompt = conv.get_prompt() 43 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 44 | image = sample['image'] 45 | if image is not None: 46 | output_ids = model.generate( 47 | input_ids, 48 | images=image.unsqueeze(0).half().cuda(), 49 | do_sample=True, 50 | temperature=1, 51 | top_p=None, 52 | num_beams=5, 53 | max_new_tokens=128, 54 | use_cache=True) 55 | 56 | # input_token_len = input_ids.shape[1] 57 | # n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 58 | # if n_diff_input_output > 0: 59 | # print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 60 | # response = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 61 | response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 62 | else: # multiple images actually 63 | if sample['question_type'] == 'multiple-choice': 64 | all_choices = sample['all_choices'] 65 | response = random.choice(all_choices) 66 | else: 67 | response = 'INVALID GENERATION FOR MULTIPLE IMAGE INPUTS' 68 | 69 | return response 70 | 71 | 72 | def llava_image_processor(raw_image, vis_processors=None): 73 | image_tensor = vis_processors.preprocess(raw_image, return_tensors='pt')['pixel_values'][0] 74 | return image_tensor 75 | -------------------------------------------------------------------------------- /llava/eval/mmmu/eval/utils/model_utils_ind.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | import torch 3 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 4 | from llava.conversation import conv_templates, SeparatorStyle 5 | from functools import partial 6 | from llava.mm_utils import tokenizer_image_token 7 | 8 | def call_llava_engine_df(args, sample, model, tokenizer=None, processor=None, h_block=None, w_block=None, mode=None): 9 | 10 | def deal_with_prompt(input_text, 
mm_use_im_start_end, ocr_tokens): 11 | if ocr_tokens is not None: 12 | qs = input_text + '\n' + ocr_tokens 13 | else: 14 | qs = input_text 15 | if mm_use_im_start_end: 16 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 17 | else: 18 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 19 | return qs 20 | 21 | prompt = sample['final_input_prompt'] 22 | ocr_tokens = sample.get('ocr', None) 23 | prompt = deal_with_prompt(prompt, model.config.mm_use_im_start_end, ocr_tokens) 24 | conv = conv_templates[args.conv_mode].copy() 25 | conv.append_message(conv.roles[0], prompt) 26 | conv.append_message(conv.roles[1], None) 27 | prompt = conv.get_prompt() 28 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 29 | image = sample['image'] 30 | 31 | if image is not None: 32 | model.orig_forward = model.forward 33 | model.forward = partial(model.orig_forward, 34 | mode=mode, 35 | h_block = [h_block], 36 | w_block = [w_block] 37 | ) 38 | output_ids = model.generate( 39 | input_ids, 40 | images=image.bfloat16().cuda(), 41 | do_sample=False, 42 | temperature=0, 43 | num_beams=1, 44 | top_p=None, 45 | max_new_tokens=1024, 46 | use_cache=True) 47 | 48 | model.forward = model.orig_forward 49 | 50 | input_token_len = input_ids.shape[1] 51 | 52 | response = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 53 | # response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip('\n') 54 | else: # multiple images actually 55 | if sample['question_type'] == 'multiple-choice': 56 | all_choices = sample['all_choices'] 57 | response = random.choice(all_choices) 58 | else: 59 | response = 'INVALID GENERATION FOR MULTIPLE IMAGE INPUTS' 60 | 61 | return response 62 | -------------------------------------------------------------------------------- /llava/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | # new stopping implementation 14 | class KeywordsStoppingCriteria(StoppingCriteria): 15 | def __init__(self, keywords, tokenizer, input_ids): 16 | self.keywords = keywords 17 | self.tokenizer = tokenizer 18 | self.start_len = None 19 | self.input_ids = input_ids 20 | 21 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 22 | if self.start_len is None: 23 | self.start_len = self.input_ids.shape[1] 24 | else: 25 | outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0] 26 | for keyword in self.keywords: 27 | if keyword in outputs: 28 | return True 29 | return False 30 | 31 | 32 | @torch.inference_mode() 33 | def eval_model(model_name, questions_file, answers_file): 34 | # Model 35 | disable_torch_init() 36 | model_name = os.path.expanduser(model_name) 37 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 38 | model = AutoModelForCausalLM.from_pretrained(model_name, 39 | torch_dtype=torch.float16).cuda() 40 | 41 | 42 | ques_file = open(os.path.expanduser(questions_file), "r") 43 | ans_file = open(os.path.expanduser(answers_file), "w") 44 | for i, line in enumerate(tqdm(ques_file)): 45 | idx = 
json.loads(line)["question_id"] 46 | qs = json.loads(line)["text"] 47 | cat = json.loads(line)["category"] 48 | conv = default_conversation.copy() 49 | conv.append_message(conv.roles[0], qs) 50 | prompt = conv.get_prompt() 51 | inputs = tokenizer([prompt]) 52 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 53 | stopping_criteria = KeywordsStoppingCriteria([conv.sep], tokenizer, input_ids) 54 | output_ids = model.generate( 55 | input_ids, 56 | do_sample=True, 57 | use_cache=True, 58 | temperature=0.7, 59 | max_new_tokens=1024, 60 | stopping_criteria=[stopping_criteria]) 61 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 62 | try: 63 | index = outputs.index(conv.sep, len(prompt)) 64 | except ValueError: 65 | outputs += conv.sep 66 | index = outputs.index(conv.sep, len(prompt)) 67 | 68 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 69 | ans_id = shortuuid.uuid() 70 | ans_file.write(json.dumps({"question_id": idx, 71 | "text": outputs, 72 | "answer_id": ans_id, 73 | "model_id": model_name, 74 | "metadata": {}}) + "\n") 75 | ans_file.flush() 76 | ans_file.close() 77 | 78 | if __name__ == "__main__": 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 81 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 82 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 83 | args = parser.parse_args() 84 | 85 | eval_model(args.model_name, args.question_file, args.answers_file) 86 | -------------------------------------------------------------------------------- /llava/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.utils import disable_torch_init 11 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path, KeywordsStoppingCriteria 12 | 13 | from PIL import Image 14 | import math 15 | import torch.nn.functional as F 16 | from functools import partial 17 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 18 | from llava.model import * 19 | from llava.patch_divide import Image_Patch 20 | from torchvision.transforms import Compose, ToTensor, Normalize 21 | 22 | 23 | def split_list(lst, n): 24 | """Split a list into n (roughly) equal-sized chunks""" 25 | chunk_size = math.ceil(len(lst) / n) # integer division 26 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 27 | 28 | 29 | def get_chunk(lst, n, k): 30 | chunks = split_list(lst, n) 31 | return chunks[k] 32 | 33 | 34 | def eval_model(args): 35 | # Model 36 | disable_torch_init() 37 | model_path = os.path.expanduser(args.model_path) 38 | model_name = get_model_name_from_path(model_path) 39 | tokenizer = AutoTokenizer.from_pretrained( 40 | model_path, 41 | model_max_length = 2048, 42 | padding_side="right", 43 | use_fast = True 44 | ) 45 | 46 | model = LlavaLlamaForCausalLM.from_pretrained( 47 | model_path, 48 | torch_dtype=torch.bfloat16, 49 | ).cuda() 50 | 51 | for m in model.modules(): 52 | m.tokenizer = tokenizer 53 | 54 | vision_tower = model.get_vision_tower() 55 | if not vision_tower.is_loaded: 56 | 
vision_tower.load_model() 57 | vision_tower.to(device='cuda', dtype=torch.float16) 58 | image_processor = vision_tower.image_processor 59 | 60 | patch_num = getattr(model.config, 'patch_num', '9') 61 | image_patch = Image_Patch(patch_num=int(patch_num)) 62 | preprocess = Compose([ToTensor(), Normalize((0.48145466, 0.4578275, 0.40821073),(0.26862954, 0.26130258, 0.27577711))]) 63 | 64 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 65 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 66 | answers_file = os.path.expanduser(args.answers_file) 67 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 68 | ans_file = open(answers_file, "w") 69 | for line in tqdm(questions): 70 | idx = line["question_id"] 71 | image_file = line["image"] 72 | qs = line["text"] 73 | cur_prompt = qs 74 | if model.config.mm_use_im_start_end: 75 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 76 | else: 77 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 78 | 79 | conv = conv_templates[args.conv_mode].copy() 80 | conv.append_message(conv.roles[0], qs) 81 | conv.append_message(conv.roles[1], None) 82 | prompt = conv.get_prompt() 83 | 84 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 85 | 86 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 87 | if model.config.image_aspect_ratio == 'slice': 88 | image = preprocess(image) 89 | image = image.unsqueeze(0) 90 | h, w = image.shape[-2:] 91 | block_size = 336 92 | h_block, w_block = image_patch.calculate(h, w) 93 | h_ratio = block_size*h_block/h 94 | w_ratio = block_size*w_block/w 95 | if h_ratio<=w_ratio: 96 | w_ = min(block_size*w_block, round(w*h_ratio)) 97 | h_ = block_size*h_block 98 | else: 99 | w_ = block_size*w_block 100 | h_ = min(block_size*h_block, round(h*w_ratio)) 101 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 102 | image = torch.zeros((1, 3, block_size*h_block, block_size*w_block)).to(dtype=image_inter.dtype, device=image_inter.device) 103 | image[:, :, :h_, :w_] = image_inter 104 | 105 | split_images = [] 106 | for i_ in range(h_block): 107 | for j_ in range(w_block): 108 | image_s = image[:,:,block_size*i_:block_size*(i_+1), block_size*j_:block_size*(j_+1)] 109 | split_images.append(image_s) 110 | if len(split_images)>1: 111 | h_ratio = block_size/h 112 | w_ratio = block_size/w 113 | if h_ratio<=w_ratio: 114 | w_ = min(block_size, round(w*h_ratio)) 115 | h_ = block_size 116 | else: 117 | w_ = block_size 118 | h_ = min(block_size, round(h*w_ratio)) 119 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 120 | image_s = torch.zeros((1, 3, block_size, block_size)).to(dtype=image_inter.dtype, device=image_inter.device) 121 | image_s[:, :, :h_, :w_] = image_inter 122 | split_images.append(image_s) 123 | image_tensor = torch.cat(split_images, dim=0) 124 | else: 125 | image_tensor = process_images([image], image_processor, model.config)[0] 126 | image_tensor = image_tensor.unsqueeze(0) 127 | h_block = 1 128 | w_block = 1 129 | 130 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 131 | keywords = [stop_str] 132 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 133 | mode = model.config.image_aspect_ratio 134 | 135 | with torch.inference_mode(): 136 | model.orig_forward = model.forward 137 | model.forward = partial(model.orig_forward, 138 | mode=mode, 139 | h_block=[h_block], 140 | 
w_block=[w_block] 141 | ) 142 | 143 | 144 | output_ids = model.generate( 145 | input_ids, 146 | images=image_tensor.to(dtype=torch.bfloat16, device='cuda', non_blocking=True), 147 | do_sample=True if args.temperature > 0 else False, 148 | temperature=args.temperature, 149 | top_p=args.top_p, 150 | num_beams=args.num_beams, 151 | # no_repeat_ngram_size=3, 152 | max_new_tokens=1024, 153 | use_cache=True) 154 | 155 | model.forward = model.orig_forward 156 | 157 | input_token_len = input_ids.shape[1] 158 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 159 | if n_diff_input_output > 0: 160 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 161 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 162 | outputs = outputs.strip() 163 | if outputs.endswith(stop_str): 164 | outputs = outputs[:-len(stop_str)] 165 | outputs = outputs.strip() 166 | 167 | ans_id = shortuuid.uuid() 168 | ans_file.write(json.dumps({"question_id": idx, 169 | "prompt": cur_prompt, 170 | "text": outputs, 171 | "answer_id": ans_id, 172 | "model_id": model_name, 173 | "metadata": {}}) + "\n") 174 | ans_file.flush() 175 | ans_file.close() 176 | 177 | if __name__ == "__main__": 178 | parser = argparse.ArgumentParser() 179 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 180 | parser.add_argument("--model-base", type=str, default=None) 181 | parser.add_argument("--image-folder", type=str, default="") 182 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 183 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 184 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 185 | parser.add_argument("--num-chunks", type=int, default=1) 186 | parser.add_argument("--chunk-idx", type=int, default=0) 187 | parser.add_argument("--temperature", type=float, default=0.2) 188 | parser.add_argument("--top_p", type=float, default=None) 189 | parser.add_argument("--num_beams", type=int, default=1) 190 | args = parser.parse_args() 191 | 192 | eval_model(args) 193 | -------------------------------------------------------------------------------- /llava/eval/run_llava.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 5 | from llava.conversation import conv_templates, SeparatorStyle 6 | from llava.model.builder import load_pretrained_model 7 | from llava.utils import disable_torch_init 8 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 9 | 10 | from PIL import Image 11 | 12 | import requests 13 | from PIL import Image 14 | from io import BytesIO 15 | 16 | 17 | def load_image(image_file): 18 | if image_file.startswith('http') or image_file.startswith('https'): 19 | response = requests.get(image_file) 20 | image = Image.open(BytesIO(response.content)).convert('RGB') 21 | else: 22 | image = Image.open(image_file).convert('RGB') 23 | return image 24 | 25 | 26 | def eval_model(args): 27 | # Model 28 | disable_torch_init() 29 | 30 | model_name = get_model_name_from_path(args.model_path) 31 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name) 32 | 33 | qs = args.query 34 | if model.config.mm_use_im_start_end: 35 | qs = DEFAULT_IM_START_TOKEN + 
DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 36 | else: 37 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 38 | 39 | if 'llama-2' in model_name.lower(): 40 | conv_mode = "llava_llama_2" 41 | elif "v1" in model_name.lower(): 42 | conv_mode = "llava_v1" 43 | elif "mpt" in model_name.lower(): 44 | conv_mode = "mpt" 45 | else: 46 | conv_mode = "llava_v0" 47 | 48 | if args.conv_mode is not None and conv_mode != args.conv_mode: 49 | print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode)) 50 | else: 51 | args.conv_mode = conv_mode 52 | 53 | conv = conv_templates[args.conv_mode].copy() 54 | conv.append_message(conv.roles[0], qs) 55 | conv.append_message(conv.roles[1], None) 56 | prompt = conv.get_prompt() 57 | 58 | image = load_image(args.image_file) 59 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda() 60 | 61 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 62 | 63 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 64 | keywords = [stop_str] 65 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 66 | 67 | with torch.inference_mode(): 68 | output_ids = model.generate( 69 | input_ids, 70 | images=image_tensor, 71 | do_sample=True, 72 | temperature=0.2, 73 | max_new_tokens=1024, 74 | use_cache=True, 75 | stopping_criteria=[stopping_criteria]) 76 | 77 | input_token_len = input_ids.shape[1] 78 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 79 | if n_diff_input_output > 0: 80 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 81 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 82 | outputs = outputs.strip() 83 | if outputs.endswith(stop_str): 84 | outputs = outputs[:-len(stop_str)] 85 | outputs = outputs.strip() 86 | print(outputs) 87 | 88 | if __name__ == "__main__": 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 91 | parser.add_argument("--model-base", type=str, default=None) 92 | parser.add_argument("--image-file", type=str, required=True) 93 | parser.add_argument("--query", type=str, required=True) 94 | parser.add_argument("--conv-mode", type=str, default=None) 95 | args = parser.parse_args() 96 | 97 | eval_model(args) 98 | -------------------------------------------------------------------------------- /llava/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in 
os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /llava/mm_utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from io import BytesIO 3 | import base64 4 | 5 | import torch 6 | from transformers import StoppingCriteria 7 | from llava.constants import IMAGE_TOKEN_INDEX 8 | 9 | 10 | def load_image_from_base64(image): 11 | return Image.open(BytesIO(base64.b64decode(image))) 12 | 13 | 14 | def expand2square(pil_img, background_color): 15 | width, height = pil_img.size 16 | if width == height: 17 | return pil_img 18 | elif width > height: 19 | result = Image.new(pil_img.mode, (width, width), background_color) 20 | result.paste(pil_img, (0, (width - height) // 2)) 21 | return result 22 | else: 23 | result = Image.new(pil_img.mode, (height, height), background_color) 24 | result.paste(pil_img, ((height - width) // 2, 0)) 25 | return result 26 | 27 | 28 | def process_images(images, image_processor, model_cfg): 29 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 30 | new_images = [] 31 | if image_aspect_ratio == 'pad': 32 | for image in images: 33 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 34 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 35 | new_images.append(image) 36 | else: 37 | return image_processor(images, return_tensors='pt')['pixel_values'] 38 | if all(x.shape == new_images[0].shape for x in new_images): 39 | new_images = torch.stack(new_images, dim=0) 40 | return new_images 41 | 42 | 43 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 44 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 45 | 46 | def insert_separator(X, sep): 47 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 48 | 49 | input_ids = [] 50 | offset = 0 51 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 52 | offset = 1 53 | input_ids.append(prompt_chunks[0][0]) 54 | 
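    # Interleave `[image_token_index] * (offset + 1)` between the tokenized chunks of the
    # prompt (the prompt was split on the '<image>' placeholder above). Taking `x[offset:]`
    # from every element keeps exactly one image_token_index per placeholder and, when the
    # tokenizer prepends a BOS (offset == 1), also drops the duplicated BOS at the start of
    # every chunk after the first.
    # Illustrative example with made-up ids (IMG = image_token_index):
    #   chunks [[BOS, a, b], [BOS, c, d]]  ->  input_ids [BOS, a, b, IMG, c, d]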
55 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 56 | input_ids.extend(x[offset:]) 57 | 58 | if return_tensors is not None: 59 | if return_tensors == 'pt': 60 | return torch.tensor(input_ids, dtype=torch.long) 61 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 62 | return input_ids 63 | 64 | 65 | def get_model_name_from_path(model_path): 66 | model_path = model_path.strip("/") 67 | model_paths = model_path.split("/") 68 | if model_paths[-1].startswith('checkpoint-'): 69 | return model_paths[-2] + "_" + model_paths[-1] 70 | else: 71 | return model_paths[-1] 72 | 73 | 74 | 75 | 76 | class KeywordsStoppingCriteria(StoppingCriteria): 77 | def __init__(self, keywords, tokenizer, input_ids): 78 | self.keywords = keywords 79 | self.keyword_ids = [] 80 | self.max_keyword_len = 0 81 | for keyword in keywords: 82 | cur_keyword_ids = tokenizer(keyword).input_ids 83 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 84 | cur_keyword_ids = cur_keyword_ids[1:] 85 | if len(cur_keyword_ids) > self.max_keyword_len: 86 | self.max_keyword_len = len(cur_keyword_ids) 87 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 88 | self.tokenizer = tokenizer 89 | self.start_len = input_ids.shape[1] 90 | 91 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 92 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO 93 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 94 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 95 | for keyword_id in self.keyword_ids: 96 | if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all(): 97 | return True 98 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 99 | for keyword in self.keywords: 100 | if keyword in outputs: 101 | return True 102 | return False -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig 3 | -------------------------------------------------------------------------------- /llava/model/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
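# `load_pretrained_model` below handles three LLaVA checkpoint layouts: a LoRA
# checkpoint merged onto `model_base`, a base LLM plus a separately trained
# `mm_projector.bin`, and a full standalone checkpoint; model names without
# 'llava' fall back to a plain `AutoModelForCausalLM`.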
14 | 15 | 16 | import os 17 | import warnings 18 | import shutil 19 | 20 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 21 | import torch 22 | from llava.model import * 23 | from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 24 | 25 | 26 | def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda"): 27 | kwargs = {"device_map": device_map} 28 | 29 | if load_8bit: 30 | kwargs['load_in_8bit'] = True 31 | elif load_4bit: 32 | kwargs['load_in_4bit'] = True 33 | kwargs['quantization_config'] = BitsAndBytesConfig( 34 | load_in_4bit=True, 35 | bnb_4bit_compute_dtype=torch.float16, 36 | bnb_4bit_use_double_quant=True, 37 | bnb_4bit_quant_type='nf4' 38 | ) 39 | else: 40 | kwargs['torch_dtype'] = torch.float16 41 | 42 | if 'llava' in model_name.lower(): 43 | # Load LLaVA model 44 | if 'lora' in model_name.lower() and model_base is None: 45 | warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.') 46 | if 'lora' in model_name.lower() and model_base is not None: 47 | lora_cfg_pretrained = AutoConfig.from_pretrained(model_path) 48 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 49 | print('Loading LLaVA from base model...') 50 | model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) 51 | token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features 52 | if model.lm_head.weight.shape[0] != token_num: 53 | model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) 54 | model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) 55 | 56 | print('Loading additional LLaVA weights...') 57 | if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): 58 | non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') 59 | else: 60 | # this is probably from HF Hub 61 | from huggingface_hub import hf_hub_download 62 | def load_from_hf(repo_id, filename, subfolder=None): 63 | cache_file = hf_hub_download( 64 | repo_id=repo_id, 65 | filename=filename, 66 | subfolder=subfolder) 67 | return torch.load(cache_file, map_location='cpu') 68 | non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') 69 | non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} 70 | if any(k.startswith('model.model.') for k in non_lora_trainables): 71 | non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} 72 | model.load_state_dict(non_lora_trainables, strict=False) 73 | 74 | from peft import PeftModel 75 | print('Loading LoRA weights...') 76 | model = PeftModel.from_pretrained(model, model_path) 77 | print('Merging LoRA weights...') 78 | model = model.merge_and_unload() 79 | print('Model is loaded...') 80 | elif model_base is not None: 81 | # this may be mm projector only 82 | print('Loading LLaVA from base model...') 83 | if 'mpt' in model_name.lower(): 84 | if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')): 85 | 
shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py')) 86 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True) 87 | cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True) 88 | model = LlavaMPTForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) 89 | else: 90 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 91 | cfg_pretrained = AutoConfig.from_pretrained(model_path) 92 | model = LlavaLlamaForCausalLM.from_pretrained( 93 | model_base, 94 | # torch_dtype=torch.bfloat16, 95 | ).cuda() 96 | # model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) 97 | 98 | mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') 99 | mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} 100 | model.load_state_dict(mm_projector_weights, strict=False) 101 | else: 102 | if 'mpt' in model_name.lower(): 103 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 104 | model = LlavaMPTForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 105 | else: 106 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 107 | model = LlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 108 | else: 109 | # Load language model 110 | if model_base is not None: 111 | # PEFT model 112 | from peft import PeftModel 113 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 114 | model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto") 115 | print(f"Loading LoRA weights from {model_path}") 116 | model = PeftModel.from_pretrained(model, model_path) 117 | print(f"Merging weights") 118 | model = model.merge_and_unload() 119 | print('Convert to FP16...') 120 | model.to(torch.float16) 121 | else: 122 | use_fast = False 123 | if 'mpt' in model_name.lower(): 124 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 125 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) 126 | else: 127 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 128 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 129 | 130 | image_processor = None 131 | 132 | if 'llava' in model_name.lower(): 133 | mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) 134 | mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) 135 | if mm_use_im_patch_token: 136 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) 137 | if mm_use_im_start_end: 138 | tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) 139 | model.resize_token_embeddings(len(tokenizer)) 140 | 141 | vision_tower = model.get_vision_tower() 142 | if not vision_tower.is_loaded: 143 | vision_tower.load_model() 144 | vision_tower.to(device=device, dtype=torch.float16) 145 | image_processor = vision_tower.image_processor 146 | 147 | if hasattr(model.config, "max_sequence_length"): 148 | context_len = model.config.max_sequence_length 149 | else: 150 | context_len = 2048 151 | 152 | return tokenizer, model, image_processor, context_len 153 | 
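# Illustrative usage sketch (the checkpoint id below is a placeholder; substitute
# your own TokenPacker/LLaVA checkpoint path):
if __name__ == "__main__":
    from llava.mm_utils import get_model_name_from_path

    demo_model_path = "liuhaotian/llava-v1.5-13b"  # placeholder checkpoint
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        demo_model_path,
        model_base=None,
        model_name=get_model_name_from_path(demo_model_path),
    )
    print(f"loaded {type(model).__name__}, context length {context_len}")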
-------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
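# In this repo `LlavaLlamaForCausalLM.forward` additionally accepts `mode`, `h_block`
# and `w_block`, which carry the high-resolution 'slice' layout of the image patches;
# the evaluation scripts pass them by temporarily wrapping `model.forward` with
# `functools.partial(...)` before calling `generate()` (see llava/eval/model_vqa.py).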
14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | from torch.nn import CrossEntropyLoss 21 | 22 | from transformers import AutoConfig, AutoModelForCausalLM, \ 23 | LlamaConfig, LlamaModel, LlamaForCausalLM 24 | 25 | from transformers.modeling_outputs import CausalLMOutputWithPast 26 | 27 | from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaConfig(LlamaConfig): 31 | model_type = "llava" 32 | 33 | 34 | class LlavaLlamaModel(LlavaMetaModel, LlamaModel): 35 | config_class = LlavaConfig 36 | 37 | def __init__(self, config: LlamaConfig): 38 | super(LlavaLlamaModel, self).__init__(config) 39 | 40 | 41 | class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM): 42 | config_class = LlavaConfig 43 | 44 | def __init__(self, config): 45 | super(LlamaForCausalLM, self).__init__(config) 46 | self.model = LlavaLlamaModel(config) 47 | 48 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.model 55 | 56 | def forward( 57 | self, 58 | input_ids: torch.LongTensor = None, 59 | attention_mask: Optional[torch.Tensor] = None, 60 | past_key_values: Optional[List[torch.FloatTensor]] = None, 61 | inputs_embeds: Optional[torch.FloatTensor] = None, 62 | labels: Optional[torch.LongTensor] = None, 63 | use_cache: Optional[bool] = None, 64 | output_attentions: Optional[bool] = None, 65 | output_hidden_states: Optional[bool] = None, 66 | images: Optional[torch.FloatTensor] = None, 67 | return_dict: Optional[bool] = None, 68 | mode = None, 69 | h_block = None, 70 | w_block = None 71 | ) -> Union[Tuple, CausalLMOutputWithPast]: 72 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions 73 | output_hidden_states = ( 74 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 75 | ) 76 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 77 | 78 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images, mode, h_block, w_block) 79 | 80 | # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) 81 | outputs = self.model( 82 | input_ids=input_ids, 83 | attention_mask=attention_mask, 84 | past_key_values=past_key_values, 85 | inputs_embeds=inputs_embeds, 86 | use_cache=use_cache, 87 | output_attentions=output_attentions, 88 | output_hidden_states=output_hidden_states, 89 | return_dict=return_dict 90 | ) 91 | 92 | hidden_states = outputs[0] 93 | logits = self.lm_head(hidden_states) 94 | 95 | loss = None 96 | if labels is not None: 97 | # Shift so that tokens < n predict n 98 | shift_logits = logits[..., :-1, :].contiguous() 99 | shift_labels = labels[..., 1:].contiguous() 100 | # Flatten the tokens 101 | loss_fct = CrossEntropyLoss() 102 | shift_logits = shift_logits.view(-1, self.config.vocab_size) 103 | shift_labels = shift_labels.view(-1) 104 | # Enable model/pipeline parallelism 105 | shift_labels = shift_labels.to(shift_logits.device) 106 | loss = loss_fct(shift_logits, shift_labels) 107 | 108 | if not return_dict: 109 | output = (logits,) + outputs[1:] 110 | return (loss,) + output if loss is not None else output 111 | 112 | return CausalLMOutputWithPast( 113 | loss=loss, 114 | logits=logits, 115 | 
past_key_values=outputs.past_key_values, 116 | hidden_states=outputs.hidden_states, 117 | attentions=outputs.attentions, 118 | ) 119 | 120 | def prepare_inputs_for_generation( 121 | self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs 122 | ): 123 | if past_key_values: 124 | input_ids = input_ids[:, -1:] 125 | 126 | # if `inputs_embeds` are passed, we only want to use them in the 1st generation step 127 | if inputs_embeds is not None and past_key_values is None: 128 | model_inputs = {"inputs_embeds": inputs_embeds} 129 | else: 130 | model_inputs = {"input_ids": input_ids} 131 | 132 | model_inputs.update( 133 | { 134 | "past_key_values": past_key_values, 135 | "use_cache": kwargs.get("use_cache"), 136 | "attention_mask": attention_mask, 137 | "images": kwargs.get("images", None), 138 | } 139 | ) 140 | return model_inputs 141 | 142 | AutoConfig.register("llava", LlavaConfig) 143 | AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) 144 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import List, Optional, Tuple 17 | import warnings 18 | 19 | import torch 20 | import torch.nn.functional as F 21 | import math 22 | 23 | from transformers import AutoConfig, AutoModelForCausalLM 24 | from transformers.modeling_outputs import CausalLMOutputWithPast 25 | 26 | from .mpt.modeling_mpt import MPTConfig, MPTForCausalLM, MPTModel 27 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 28 | 29 | 30 | class LlavaMPTConfig(MPTConfig): 31 | model_type = "llava_mpt" 32 | 33 | 34 | class LlavaMPTModel(LlavaMetaModel, MPTModel): 35 | config_class = LlavaMPTConfig 36 | 37 | def __init__(self, config: MPTConfig): 38 | config.hidden_size = config.d_model 39 | super(LlavaMPTModel, self).__init__(config) 40 | 41 | def embed_tokens(self, x): 42 | return self.wte(x) 43 | 44 | 45 | class LlavaMPTForCausalLM(MPTForCausalLM, LlavaMetaForCausalLM): 46 | config_class = LlavaMPTConfig 47 | supports_gradient_checkpointing = True 48 | 49 | def __init__(self, config): 50 | super(MPTForCausalLM, self).__init__(config) 51 | 52 | if not config.tie_word_embeddings: 53 | raise ValueError('MPTForCausalLM only supports tied word embeddings') 54 | self.transformer = LlavaMPTModel(config) 55 | self.logit_scale = None 56 | if config.logit_scale is not None: 57 | logit_scale = config.logit_scale 58 | if isinstance(logit_scale, str): 59 | if logit_scale == 'inv_sqrt_d_model': 60 | logit_scale = 1 / math.sqrt(config.d_model) 61 | else: 62 | raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.") 63 | self.logit_scale = logit_scale 64 | 65 | def get_model(self): 66 | return self.transformer 67 | 68 | def _set_gradient_checkpointing(self, module, value=False): 69 | if isinstance(module, LlavaMPTModel): 70 | module.gradient_checkpointing = value 71 | 72 | def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None): 73 | return_dict = return_dict if return_dict is not None else self.config.return_dict 74 | use_cache = use_cache if use_cache is not None else self.config.use_cache 75 | 76 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 77 | outputs = self.transformer(input_ids=input_ids, inputs_embeds=inputs_embeds, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache) 78 | # FIXME: this is a hack to fix the multiple gpu inference issue in https://github.com/haotian-liu/LLaVA/issues/338 79 | logits = F.linear(outputs.last_hidden_state.to(self.transformer.wte.weight.device), self.transformer.wte.weight) 80 | if self.logit_scale is not None: 81 | if self.logit_scale == 0: 82 | warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. 
This will produce uniform (uninformative) outputs.') 83 | logits *= self.logit_scale 84 | loss = None 85 | if labels is not None: 86 | labels = torch.roll(labels, shifts=-1) 87 | labels[:, -1] = -100 88 | loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1)) 89 | return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states) 90 | 91 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 92 | if inputs_embeds is not None: 93 | raise NotImplementedError('inputs_embeds is not implemented for MPT yet') 94 | attention_mask = kwargs['attention_mask'].bool() 95 | if attention_mask[:, -1].sum() != attention_mask.shape[0]: 96 | raise NotImplementedError('MPT does not support generation with right padding.') 97 | if self.transformer.attn_uses_sequence_id and self.training: 98 | sequence_id = torch.zeros_like(input_ids[:1]) 99 | else: 100 | sequence_id = None 101 | if past_key_values is not None: 102 | input_ids = input_ids[:, -1].unsqueeze(-1) 103 | if self.transformer.prefix_lm: 104 | prefix_mask = torch.ones_like(attention_mask) 105 | if kwargs.get('use_cache') == False: 106 | raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.') 107 | else: 108 | prefix_mask = None 109 | return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True), "images": kwargs.get("images", None)} 110 | 111 | 112 | AutoConfig.register("llava_mpt", LlavaMPTConfig) 113 | AutoModelForCausalLM.register(LlavaMPTConfig, LlavaMPTForCausalLM) 114 | -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/adapt_tokenizer.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/attention.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/attention.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/blocks.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/blocks.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/configuration_mpt.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/custom_embedding.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/flash_attn_triton.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/flash_attn_triton.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/hf_prefixlm_converter.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/meta_init_context.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/meta_init_context.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/modeling_mpt.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/norm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/norm.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/__pycache__/param_init_fns.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/model/language_model/mpt/__pycache__/param_init_fns.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/mpt/adapt_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast 3 | Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] 4 | NUM_SENTINEL_TOKENS: int = 100 5 | 6 | def adapt_tokenizer_for_denoising(tokenizer: Tokenizer): 7 | """Adds sentinel tokens and padding token (if missing). 8 | 9 | Expands the tokenizer vocabulary to include sentinel tokens 10 | used in mixture-of-denoiser tasks as well as a padding token. 11 | 12 | All added tokens are added as special tokens. No tokens are 13 | added if sentinel tokens and padding token already exist. 
14 | """ 15 | sentinels_to_add = [f'' for i in range(NUM_SENTINEL_TOKENS)] 16 | tokenizer.add_tokens(sentinels_to_add, special_tokens=True) 17 | if tokenizer.pad_token is None: 18 | tokenizer.add_tokens('', special_tokens=True) 19 | tokenizer.pad_token = '' 20 | assert tokenizer.pad_token_id is not None 21 | sentinels = ''.join([f'' for i in range(NUM_SENTINEL_TOKENS)]) 22 | _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids 23 | tokenizer.sentinel_token_ids = _sentinel_token_ids 24 | 25 | class AutoTokenizerForMOD(AutoTokenizer): 26 | """AutoTokenizer + Adaptation for MOD. 27 | 28 | A simple wrapper around AutoTokenizer to make instantiating 29 | an MOD-adapted tokenizer a bit easier. 30 | 31 | MOD-adapted tokenizers have sentinel tokens (e.g., ), 32 | a padding token, and a property to get the token ids of the 33 | sentinel tokens. 34 | """ 35 | 36 | @classmethod 37 | def from_pretrained(cls, *args, **kwargs): 38 | """See `AutoTokenizer.from_pretrained` docstring.""" 39 | tokenizer = super().from_pretrained(*args, **kwargs) 40 | adapt_tokenizer_for_denoising(tokenizer) 41 | return tokenizer -------------------------------------------------------------------------------- /llava/model/language_model/mpt/blocks.py: -------------------------------------------------------------------------------- 1 | """GPT Blocks used for the GPT Model.""" 2 | from typing import Dict, Optional, Tuple 3 | import torch 4 | import torch.nn as nn 5 | from .attention import ATTN_CLASS_REGISTRY 6 | from .norm import NORM_CLASS_REGISTRY 7 | 8 | class MPTMLP(nn.Module): 9 | 10 | def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None): 11 | super().__init__() 12 | self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device) 13 | self.act = nn.GELU(approximate='none') 14 | self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device) 15 | self.down_proj._is_residual = True 16 | 17 | def forward(self, x): 18 | return self.down_proj(self.act(self.up_proj(x))) 19 | 20 | class MPTBlock(nn.Module): 21 | 22 | def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, device: Optional[str]=None, **kwargs): 23 | del kwargs 24 | super().__init__() 25 | norm_class = NORM_CLASS_REGISTRY[norm_type.lower()] 26 | attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']] 27 | self.norm_1 = norm_class(d_model, device=device) 28 | self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], d_model=d_model, n_heads=n_heads, verbose=verbose, device=device) 29 | self.norm_2 = norm_class(d_model, device=device) 30 | self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device) 31 | self.resid_attn_dropout = nn.Dropout(resid_pdrop) 32 | self.resid_ffn_dropout = nn.Dropout(resid_pdrop) 33 | 34 | def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]: 35 | a = 
self.norm_1(x) 36 | (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal) 37 | x = x + self.resid_attn_dropout(b) 38 | m = self.norm_2(x) 39 | n = self.ffn(m) 40 | x = x + self.resid_ffn_dropout(n) 41 | return (x, attn_weights, past_key_value) -------------------------------------------------------------------------------- /llava/model/language_model/mpt/configuration_mpt.py: -------------------------------------------------------------------------------- 1 | """A HuggingFace-style model configuration.""" 2 | from typing import Dict, Optional, Union 3 | from transformers import PretrainedConfig 4 | attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8} 5 | init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0} 6 | 7 | class MPTConfig(PretrainedConfig): 8 | model_type = 'mpt' 9 | 10 | def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs): 11 | """The MPT configuration class. 12 | 13 | Args: 14 | d_model (int): The size of the embedding dimension of the model. 15 | n_heads (int): The number of attention heads. 16 | n_layers (int): The number of layers in the model. 17 | expansion_ratio (int): The ratio of the up/down scale in the MLP. 18 | max_seq_len (int): The maximum sequence length of the model. 19 | vocab_size (int): The size of the vocabulary. 20 | resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. 21 | emb_pdrop (float): The dropout probability for the embedding layer. 22 | learned_pos_emb (bool): Whether to use learned positional embeddings 23 | attn_config (Dict): A dictionary used to configure the model's attention module: 24 | attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention 25 | attn_pdrop (float): The dropout probability for the attention layers. 26 | attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. 27 | qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. 28 | clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to 29 | this value. 30 | softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, 31 | use the default scale of ``1/sqrt(d_keys)``. 32 | prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an 33 | extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix 34 | can attend to one another bi-directionally. Tokens outside the prefix use causal attention. 
35 | attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. 36 | When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates 37 | which sub-sequence each token belongs to. 38 | Defaults to ``False`` meaning any provided `sequence_id` will be ignored. 39 | alibi (bool): Whether to use the alibi bias instead of position embeddings. 40 | alibi_bias_max (int): The maximum value of the alibi bias. 41 | init_device (str): The device to use for parameter initialization. 42 | logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. 43 | no_bias (bool): Whether to use bias in all layers. 44 | verbose (int): The verbosity level. 0 is silent. 45 | embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. 46 | norm_type (str): choose type of norm to use 47 | multiquery_attention (bool): Whether to use multiquery attention implementation. 48 | use_cache (bool): Whether or not the model should return the last key/values attentions 49 | init_config (Dict): A dictionary used to configure the model initialization: 50 | init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', 51 | 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or 52 | 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. 53 | init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. 54 | emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. 55 | emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution 56 | used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. 57 | init_std (float): The standard deviation of the normal distribution used to initialize the model, 58 | if using the baseline_ parameter initialization scheme. 59 | init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. 60 | fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. 61 | init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. 
62 | --- 63 | See llmfoundry.models.utils.param_init_fns.py for info on other param init config options 64 | """ 65 | self.d_model = d_model 66 | self.n_heads = n_heads 67 | self.n_layers = n_layers 68 | self.expansion_ratio = expansion_ratio 69 | self.max_seq_len = max_seq_len 70 | self.vocab_size = vocab_size 71 | self.resid_pdrop = resid_pdrop 72 | self.emb_pdrop = emb_pdrop 73 | self.learned_pos_emb = learned_pos_emb 74 | self.attn_config = attn_config 75 | self.init_device = init_device 76 | self.logit_scale = logit_scale 77 | self.no_bias = no_bias 78 | self.verbose = verbose 79 | self.embedding_fraction = embedding_fraction 80 | self.norm_type = norm_type 81 | self.use_cache = use_cache 82 | self.init_config = init_config 83 | if 'name' in kwargs: 84 | del kwargs['name'] 85 | if 'loss_fn' in kwargs: 86 | del kwargs['loss_fn'] 87 | super().__init__(**kwargs) 88 | self._validate_config() 89 | 90 | def _set_config_defaults(self, config, config_defaults): 91 | for (k, v) in config_defaults.items(): 92 | if k not in config: 93 | config[k] = v 94 | return config 95 | 96 | def _validate_config(self): 97 | self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults) 98 | self.init_config = self._set_config_defaults(self.init_config, init_config_defaults) 99 | if self.d_model % self.n_heads != 0: 100 | raise ValueError('d_model must be divisible by n_heads') 101 | if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])): 102 | raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1") 103 | if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: 104 | raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}") 105 | if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']: 106 | raise NotImplementedError('prefix_lm only implemented with torch and triton attention.') 107 | if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']: 108 | raise NotImplementedError('alibi only implemented with torch and triton attention.') 109 | if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']: 110 | raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.') 111 | if self.embedding_fraction > 1 or self.embedding_fraction <= 0: 112 | raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!') 113 | if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model': 114 | raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.") 115 | if self.init_config.get('name', None) is None: 116 | raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.") 117 | if not self.learned_pos_emb and (not self.attn_config['alibi']): 118 | raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.') -------------------------------------------------------------------------------- /llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: 
bool=False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) -------------------------------------------------------------------------------- /llava/model/language_model/mpt/meta_init_context.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import torch 3 | import torch.nn as nn 4 | 5 | @contextmanager 6 | def init_empty_weights(include_buffers: bool=False): 7 | """Meta initialization context manager. 8 | 9 | A context manager under which models are initialized with all parameters 10 | on the meta device, therefore creating an empty model. Useful when just 11 | initializing the model would blow the available RAM. 12 | 13 | Args: 14 | include_buffers (`bool`, *optional*, defaults to `False`): Whether or 15 | not to also put all buffers on the meta device while initializing. 16 | 17 | Example: 18 | ```python 19 | import torch.nn as nn 20 | 21 | # Initialize a model with 100 billions parameters in no time and without using any RAM. 22 | with init_empty_weights(): 23 | tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)]) 24 | ``` 25 | 26 | 27 | 28 | Any model created under this context manager has no weights. As such you can't do something like 29 | `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`]. 30 | 31 | 32 | """ 33 | with init_on_device(torch.device('meta'), include_buffers=include_buffers) as f: 34 | yield f 35 | 36 | @contextmanager 37 | def init_on_device(device: torch.device, include_buffers: bool=False): 38 | """Device initialization context manager. 39 | 40 | A context manager under which models are initialized with all parameters 41 | on the specified device. 42 | 43 | Args: 44 | device (`torch.device`): Device to initialize all parameters on. 45 | include_buffers (`bool`, *optional*, defaults to `False`): Whether or 46 | not to also put all buffers on the meta device while initializing. 
47 | 48 | Example: 49 | ```python 50 | import torch.nn as nn 51 | 52 | with init_on_device(device=torch.device("cuda")): 53 | tst = nn.Liner(100, 100) # on `cuda` device 54 | ``` 55 | """ 56 | old_register_parameter = nn.Module.register_parameter 57 | if include_buffers: 58 | old_register_buffer = nn.Module.register_buffer 59 | 60 | def register_empty_parameter(module, name, param): 61 | old_register_parameter(module, name, param) 62 | if param is not None: 63 | param_cls = type(module._parameters[name]) 64 | kwargs = module._parameters[name].__dict__ 65 | module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs) 66 | 67 | def register_empty_buffer(module, name, buffer): 68 | old_register_buffer(module, name, buffer) 69 | if buffer is not None: 70 | module._buffers[name] = module._buffers[name].to(device) 71 | if include_buffers: 72 | tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ['empty', 'zeros', 'ones', 'full']} 73 | else: 74 | tensor_constructors_to_patch = {} 75 | 76 | def patch_tensor_constructor(fn): 77 | 78 | def wrapper(*args, **kwargs): 79 | kwargs['device'] = device 80 | return fn(*args, **kwargs) 81 | return wrapper 82 | try: 83 | nn.Module.register_parameter = register_empty_parameter 84 | if include_buffers: 85 | nn.Module.register_buffer = register_empty_buffer 86 | for torch_function_name in tensor_constructors_to_patch.keys(): 87 | setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name))) 88 | yield 89 | finally: 90 | nn.Module.register_parameter = old_register_parameter 91 | if include_buffers: 92 | nn.Module.register_buffer = old_register_buffer 93 | for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items(): 94 | setattr(torch, torch_function_name, old_torch_function) -------------------------------------------------------------------------------- /llava/model/language_model/mpt/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def _cast_if_autocast_enabled(tensor): 4 | if torch.is_autocast_enabled(): 5 | if tensor.device.type == 'cuda': 6 | dtype = torch.get_autocast_gpu_dtype() 7 | elif tensor.device.type == 'cpu': 8 | dtype = torch.get_autocast_cpu_dtype() 9 | else: 10 | raise NotImplementedError() 11 | return tensor.to(dtype=dtype) 12 | return tensor 13 | 14 | class LPLayerNorm(torch.nn.LayerNorm): 15 | 16 | def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None): 17 | super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype) 18 | 19 | def forward(self, x): 20 | module_device = x.device 21 | downcast_x = _cast_if_autocast_enabled(x) 22 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 23 | downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias 24 | with torch.autocast(enabled=False, device_type=module_device.type): 25 | return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps) 26 | 27 | def rms_norm(x, weight=None, eps=1e-05): 28 | output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) 29 | if weight is not None: 30 | return output * weight 31 | return output 32 | 33 | class RMSNorm(torch.nn.Module): 34 | 35 | def __init__(self, normalized_shape, eps=1e-05, weight=True, 
dtype=None, device=None): 36 | super().__init__() 37 | self.eps = eps 38 | if weight: 39 | self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device)) 40 | else: 41 | self.register_parameter('weight', None) 42 | 43 | def forward(self, x): 44 | return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype) 45 | 46 | class LPRMSNorm(RMSNorm): 47 | 48 | def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None): 49 | super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device) 50 | 51 | def forward(self, x): 52 | downcast_x = _cast_if_autocast_enabled(x) 53 | downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight 54 | with torch.autocast(enabled=False, device_type=x.device.type): 55 | return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype) 56 | NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm} -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | self.is_loaded = False 11 | 12 | self.vision_tower_name = vision_tower 13 | self.select_layer = args.mm_vision_select_layer 14 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 15 | 16 | if not delay_load: 17 | self.load_model() 18 | else: 19 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 20 | 21 | def load_model(self): 22 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 23 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 24 | self.vision_tower.requires_grad_(False) 25 | 26 | self.is_loaded = True 27 | 28 | def feature_select(self, image_forward_outs, layers=[12,16,22,23]): 29 | image_feature_list = [] 30 | for l in layers: 31 | image_feature_list.append(image_forward_outs.hidden_states[l]) 32 | image_features_multi = torch.cat(image_feature_list, dim=2) 33 | 34 | image_features = image_forward_outs.hidden_states[self.select_layer] 35 | 36 | if self.select_feature == 'patch': 37 | image_features = image_features[:, 1:] 38 | image_features_multi = image_features_multi[:, 1:] 39 | 40 | elif self.select_feature == 'cls_patch': 41 | image_features = image_features 42 | else: 43 | raise ValueError(f'Unexpected select 
feature: {self.select_feature}') 44 | return image_features, image_features_multi 45 | 46 | @torch.no_grad() 47 | def forward(self, images): 48 | 49 | if type(images) is list: 50 | image_features = [] 51 | for image in images: 52 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 53 | image_feature, image_feature_multi = self.feature_select(image_forward_out) 54 | 55 | image_features.append(image_feature.to(image.dtype)) 56 | image_features_multi.append(image_feature_multi.to(image.dtype)) 57 | 58 | else: 59 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 60 | image_features, image_features_multi = self.feature_select(image_forward_outs) 61 | 62 | return (image_features.to(images.dtype), image_features_multi.to(images.dtype)) 63 | 64 | @property 65 | def dummy_feature(self): 66 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 67 | 68 | @property 69 | def dtype(self): 70 | return self.vision_tower.dtype 71 | 72 | @property 73 | def device(self): 74 | return self.vision_tower.device 75 | 76 | @property 77 | def config(self): 78 | if self.is_loaded: 79 | return self.vision_tower.config 80 | else: 81 | return self.cfg_only 82 | 83 | @property 84 | def hidden_size(self): 85 | return self.config.hidden_size 86 | 87 | @property 88 | def num_patches(self): 89 | return (self.config.image_size // self.config.patch_size) ** 2 90 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | from functools import partial 5 | import numpy as np 6 | from torch.nn.init import trunc_normal_ 7 | from torch.nn import functional as F 8 | import math 9 | 10 | 11 | class IdentityMap(nn.Module): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def forward(self, x, *args, **kwargs): 16 | return x 17 | 18 | @property 19 | def config(self): 20 | return {"mm_projector_type": 'identity'} 21 | 22 | 23 | class SimpleResBlock(nn.Module): 24 | def __init__(self, channels): 25 | super().__init__() 26 | self.pre_norm = nn.LayerNorm(channels) 27 | 28 | self.proj = nn.Sequential( 29 | nn.Linear(channels, channels), 30 | nn.GELU(), 31 | nn.Linear(channels, channels) 32 | ) 33 | def forward(self, x): 34 | x = self.pre_norm(x) 35 | return x + self.proj(x) 36 | 37 | 38 | 39 | class TokenPacker(nn.Module): 40 | def __init__( 41 | self, 42 | raw_grid=24, 43 | embed_dim=1024, 44 | num_heads=1024//128, 45 | kv_dim=1024, 46 | hidden_size=4096, 47 | scale_factor=2, 48 | norm_layer=partial(nn.LayerNorm, eps=1e-6) 49 | ): 50 | super().__init__() 51 | if raw_grid%scale_factor!=0: 52 | raise ValueError("scale_factor must be divisible by grid size") 53 | self.raw_grid = raw_grid 54 | self.grid_size = raw_grid//scale_factor 55 | self.num_queries = self.grid_size ** 2 56 | self.embed_dim = embed_dim 57 | self.num_heads = num_heads 58 | self.scale_factor = scale_factor 59 | self.q_proj_1 = nn.Linear(kv_dim, embed_dim, bias=False) 60 | 61 | k_modules = [nn.Linear(4096, 1024)] 62 | for _ in range(1,2): 63 | k_modules.append(nn.GELU()) 64 | k_modules.append(nn.Linear(1024, 1024)) 65 | self.k_proj_1 = nn.Sequential(*k_modules) 66 | 67 | v_modules = [nn.Linear(4096, 1024)] 68 | for _ in range(1,2): 69 | v_modules.append(nn.GELU()) 70 | 
v_modules.append(nn.Linear(1024, 1024)) 71 | self.v_proj_1 = nn.Sequential(*v_modules) 72 | 73 | self.ln_q_1 = norm_layer(embed_dim) 74 | self.ln_k_1 = norm_layer(embed_dim) 75 | self.ln_v_1 = norm_layer(embed_dim) 76 | 77 | self.clip_attn = nn.MultiheadAttention(embed_dim, num_heads) 78 | 79 | modules = [nn.Linear(1024, hidden_size)] 80 | for _ in range(1, 2): 81 | modules.append(nn.GELU()) 82 | modules.append(nn.Linear(hidden_size, hidden_size)) 83 | self.mlp = nn.Sequential(*modules) 84 | 85 | self.apply(self._init_weights) 86 | 87 | def _init_weights(self, m): 88 | if isinstance(m, nn.Linear): 89 | trunc_normal_(m.weight, std=.02) 90 | if isinstance(m, nn.Linear) and m.bias is not None: 91 | nn.init.constant_(m.bias, 0) 92 | elif isinstance(m, nn.LayerNorm): 93 | nn.init.constant_(m.bias, 0) 94 | nn.init.constant_(m.weight, 1.0) 95 | 96 | def divide_feature(self, x, kernel_size, token_num, N, c): 97 | h = w = int(token_num**0.5) 98 | 99 | reshape_x = x.reshape(h, w, N, c).reshape(h//kernel_size, kernel_size, w, N, c) 100 | reshape_x = reshape_x.permute(0,2,1,3,4) 101 | reshape_x = reshape_x.reshape(h//kernel_size, w//kernel_size, kernel_size, kernel_size, N, c) 102 | reshape_x = reshape_x.permute(0,1,3,2,4,5).reshape(h//kernel_size, w//kernel_size, kernel_size*kernel_size, N, c) 103 | reshape_x = reshape_x.permute(2,0,1,3,4).reshape(kernel_size*kernel_size, -1, c) 104 | 105 | return reshape_x 106 | 107 | def forward(self, x, attn_mask=None): 108 | 109 | x_multi = x[1] # mulit-level 110 | x = x[0] # original single-level 111 | 112 | key = self.ln_k_1(self.k_proj_1(x_multi)).permute(1, 0, 2) 113 | value = self.ln_v_1(self.v_proj_1(x_multi)).permute(1, 0, 2) 114 | 115 | token_num, N, c = key.shape 116 | 117 | q = F.interpolate(x.reshape(x.shape[0],self.raw_grid,self.raw_grid,-1).float().permute(0,3,1,2), size=(self.grid_size, self.grid_size), mode='bilinear').permute(0,2,3,1) ## fix 118 | q = q.reshape(q.shape[0], -1, q.shape[-1]).to(x.dtype) 119 | 120 | query = self.ln_q_1(self.q_proj_1(q)).permute(1, 0, 2) 121 | 122 | reshape_query = self.divide_feature(query, 1, self.num_queries, N, c) 123 | reshape_key = self.divide_feature(key, self.scale_factor, token_num, N, c) 124 | reshape_value = self.divide_feature(value, self.scale_factor, token_num, N, value.shape[-1]) 125 | 126 | out = self.clip_attn( 127 | reshape_query, 128 | reshape_key, 129 | reshape_value, 130 | attn_mask=attn_mask)[0] 131 | 132 | x = out 133 | x = x.reshape(self.num_queries, N, -1) 134 | x = x.permute(1, 0, 2) 135 | 136 | x = self.mlp(x) 137 | return x 138 | 139 | def _repeat(self, query, N: int): 140 | return query.unsqueeze(1).repeat(1, N, 1) 141 | 142 | 143 | 144 | def build_vision_projector(config): 145 | return TokenPacker(hidden_size=config.hidden_size, scale_factor=config.scale_factor) 146 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/patch_divide.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision.ops.boxes import box_area 3 | 4 | patches_9=[ 5 | (1,1), 6 | (1,2),(2,1), 7 | (1,3),(3,1), 8 | (2,2),(1,4),(4,1), 9 | (1,5),(5,1), 10 | (1,6),(6,1),(2,3),(3,2), 11 | (1,7),(7,1), 12 | (4,2),(2,4),(1,8),(8,1), 13 | (3,3),(1,9),(9,1) 14 | ] 15 | 16 | patches_16=[ 17 | (1,1), 18 | (1,2),(2,1), 19 | (1,3),(3,1), 20 | (2,2),(1,4),(4,1), 21 | (1,5),(5,1), 22 | (1,6),(6,1),(2,3),(3,2), 23 | (1,7),(7,1), 24 | (4,2),(2,4),(1,8),(8,1), 25 | (3,3),(1,9),(9,1), 26 | (2,5),(5,2), 27 | (2,6),(6,2),(3,4), (4,3), 28 | (2,7),(7,2), 29 | (3,5),(5,3), 30 | (2,8),(8,2),(4,4) 31 | ] 32 | 33 | patches_25=[ 34 | (1,1), 35 | (1,2),(2,1), 36 | (1,3),(3,1), 37 | (2,2),(1,4),(4,1), 38 | (1,5),(5,1), 39 | (1,6),(6,1),(2,3),(3,2), 40 | (1,7),(7,1), 41 | (4,2),(2,4),(1,8),(8,1), 42 | (3,3),(1,9),(9,1), 43 | (2,5),(5,2), 44 | (2,6),(6,2),(3,4), (4,3), 45 | (2,7),(7,2), 46 | (3,5),(5,3), 47 | (2,8),(8,2),(4,4), 48 | (3,6),(6,3),(2,9),(9,2), 49 | (4,5),(5,4),(2,10),(10,2), 50 | (3,7),(7,3), 51 | (11,2),(2,11), 52 | (4,6),(6,4),(12,2),(2,12),(3,8),(8,3),(4,6),(6,4), 53 | (5,5) 54 | ] 55 | 56 | 57 | def box_iou(boxes1, area1, boxes2, eps=1e-5): 58 | area2 = box_area(boxes2) 59 | 60 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 61 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 62 | 63 | wh = (rb - lt).clamp(min=0) # [N,M,2] 64 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 65 | 66 | union = area1[:, None] + area2 - inter 67 | 68 | iou = inter / (union+eps) 69 | return iou, union 70 | 71 | class Image_Patch: 72 | def __init__(self, image_size=336, patch_num=9): 73 | if patch_num == 9: 74 | patches = patches_9 75 | elif patch_num == 16: 76 | patches = patches_16 77 | elif patch_num == 25: 78 | patches = patches_25 79 | else: 80 | raise(NotImplementedError) 81 | 82 | # h,w 83 | if isinstance(image_size, int): 84 | image_size = (image_size, image_size) 85 | self.image_size = image_size 86 | 87 | self.patch_list = patches 88 | 89 | self.patches = torch.tensor( 90 | [[0, 0, _[0]*image_size[0], _[1]*image_size[1]] 91 | for _ in patches], requires_grad=False 92 | ) 93 | 94 | self.patch_areas = box_area(self.patches) 95 | 96 | def calculate(self, h, w): 97 | input_box = torch.tensor([0, 0, h, w]).unsqueeze(0) 98 | ratio = self.patches[:, 2:]/input_box[:, 2:] 99 | ratio = ratio.min(dim=-1)[0] 100 | score = torch.round(h*ratio) * torch.round(w*ratio) / self.patch_areas 101 | iou, _ = box_iou(self.patches, self.patch_areas, input_box*1.4) 102 | iou = iou[:, 0] 103 | score = score + iou*0.1 104 | idx = torch.argmax(score) 105 | return self.patch_list[idx] -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/serve/__init__.py 
-------------------------------------------------------------------------------- /llava/serve/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | 5 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 6 | from llava.conversation import conv_templates, SeparatorStyle 7 | from llava.model.builder import load_pretrained_model 8 | from llava.utils import disable_torch_init 9 | from llava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 10 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 11 | from llava.model import * 12 | import torch.nn.functional as F 13 | from functools import partial 14 | from llava.patch_divide import Image_Patch 15 | from torchvision.transforms import Compose, ToTensor, Normalize 16 | 17 | from PIL import Image 18 | 19 | import requests 20 | from PIL import Image 21 | from io import BytesIO 22 | from transformers import TextStreamer 23 | from functools import partial 24 | import time 25 | 26 | def main(args): 27 | # Model 28 | disable_torch_init() 29 | model_path = os.path.expanduser(args.model_path) 30 | model_name = get_model_name_from_path(model_path) 31 | tokenizer = AutoTokenizer.from_pretrained( 32 | args.model_path, 33 | model_max_length = 2048, 34 | padding_side="right", 35 | use_fast = True 36 | ) 37 | model = LlavaLlamaForCausalLM.from_pretrained( 38 | args.model_path, 39 | torch_dtype=torch.bfloat16, 40 | ).cuda() 41 | 42 | for m in model.modules(): 43 | m.tokenizer = tokenizer 44 | 45 | vision_tower = model.get_vision_tower() 46 | if not vision_tower.is_loaded: 47 | vision_tower.load_model() 48 | vision_tower.to(device='cuda', dtype=torch.float16) 49 | image_processor = vision_tower.image_processor 50 | 51 | patch_num = getattr(model.config, 'patch_num', '9') 52 | image_patch = Image_Patch(int(patch_num)) 53 | preprocess = Compose([ToTensor(), Normalize((0.48145466, 0.4578275, 0.40821073),(0.26862954, 0.26130258, 0.27577711))]) 54 | 55 | 56 | while True: 57 | conv = conv_templates[args.conv_mode].copy() 58 | if "mpt" in model_name.lower(): 59 | roles = ('user', 'assistant') 60 | else: 61 | roles = conv.roles 62 | 63 | image_file = input("image file: ") 64 | 65 | image = Image.open(image_file).convert('RGB') 66 | 67 | if model.config.image_aspect_ratio == 'slice': 68 | image = preprocess(image) 69 | image = image.unsqueeze(0) 70 | h, w = image.shape[-2:] 71 | block_size = 336 72 | h_block, w_block = image_patch.calculate(h, w) 73 | h_ratio = block_size*h_block/h 74 | w_ratio = block_size*w_block/w 75 | if h_ratio<=w_ratio: 76 | w_ = min(block_size*w_block, round(w*h_ratio)) 77 | h_ = block_size*h_block 78 | else: 79 | w_ = block_size*w_block 80 | h_ = min(block_size*h_block, round(h*w_ratio)) 81 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 82 | image = torch.zeros((1, 3, block_size*h_block, block_size*w_block)).to(dtype=image_inter.dtype, device=image_inter.device) 83 | image[:, :, :h_, :w_] = image_inter 84 | 85 | split_images = [] 86 | for i_ in range(h_block): 87 | for j_ in range(w_block): 88 | image_s = image[:,:,block_size*i_:block_size*(i_+1), block_size*j_:block_size*(j_+1)] 89 | split_images.append(image_s) 90 | if len(split_images)>1: 91 | h_ratio = block_size/h 92 | w_ratio = block_size/w 93 | if h_ratio<=w_ratio: 94 | w_ = min(block_size, round(w*h_ratio)) 95 | h_ = 
block_size 96 | else: 97 | w_ = block_size 98 | h_ = min(block_size, round(h*w_ratio)) 99 | image_inter = F.interpolate(image, size=(h_,w_), mode='bilinear') 100 | image_s = torch.zeros((1, 3, block_size, block_size)).to(dtype=image_inter.dtype, device=image_inter.device) 101 | image_s[:, :, :h_, :w_] = image_inter 102 | split_images.append(image_s) 103 | image_tensor = torch.cat(split_images, dim=0) 104 | else: 105 | image_tensor = process_images([image], image_processor, model.config)[0] 106 | image_tensor = image_tensor.unsqueeze(0) 107 | h_block = 1 108 | w_block = 1 109 | 110 | try: 111 | inp = input(f"{roles[0]}: ") 112 | except EOFError: 113 | inp = "" 114 | if not inp: 115 | print("exit...") 116 | break 117 | # inp = "what is in the image?" 118 | 119 | print(f"{roles[1]}: ", end="") 120 | 121 | if image is not None: 122 | if model.config.mm_use_im_start_end: 123 | inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp 124 | else: 125 | inp = DEFAULT_IMAGE_TOKEN + '\n' + inp 126 | conv.append_message(conv.roles[0], inp) 127 | image = None 128 | else: 129 | # later messages 130 | conv.append_message(conv.roles[0], inp) 131 | conv.append_message(conv.roles[1], None) 132 | prompt = conv.get_prompt() 133 | 134 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 135 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 136 | keywords = [stop_str] 137 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 138 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) 139 | 140 | mode = model.config.image_aspect_ratio 141 | with torch.inference_mode(): 142 | model.orig_forward = model.forward 143 | model.forward = partial(model.orig_forward, 144 | mode=mode, 145 | h_block=h_block, 146 | w_block=w_block) 147 | start = time.time() 148 | 149 | output_ids = model.generate( 150 | input_ids, 151 | images=image_tensor.to(dtype=torch.bfloat16, device='cuda', non_blocking=True), 152 | do_sample=True, 153 | temperature=args.temperature, 154 | max_new_tokens=args.max_new_tokens, 155 | streamer=streamer, 156 | use_cache=True, 157 | stopping_criteria=[stopping_criteria]) 158 | model.forward = model.orig_forward 159 | 160 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() 161 | end = time.time() 162 | print("***time: ", end-start) 163 | conv.messages[-1][-1] = outputs 164 | 165 | if args.debug: 166 | print("\n", {"prompt": prompt, "outputs": outputs}, "\n") 167 | 168 | 169 | if __name__ == "__main__": 170 | parser = argparse.ArgumentParser() 171 | parser.add_argument("--model-path", type=str, default="path/to/tokenpacker") 172 | parser.add_argument("--device", type=str, default="cuda") 173 | parser.add_argument("--conv-mode", type=str, default='vicuna_v1') 174 | parser.add_argument("--temperature", type=float, default=0.2) 175 | parser.add_argument("--max-new-tokens", type=int, default=512) 176 | parser.add_argument("--load-8bit", action="store_true") 177 | parser.add_argument("--load-4bit", action="store_true") 178 | parser.add_argument("--debug", action="store_true") 179 | args = parser.parse_args() 180 | main(args) 181 | -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/TokenPacker/771a16e7462c3afb908a0c429513ba43f709f2c1/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", 
type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /llava/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | import warnings 3 | 4 | import torch 5 | 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv 8 | 9 | try: 10 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func 11 | except ImportError: 12 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func 13 | from flash_attn.bert_padding import unpad_input, pad_input 14 | 15 | 16 | def forward( 17 | self, 18 | hidden_states: torch.Tensor, 19 | attention_mask: Optional[torch.Tensor] = None, 20 | position_ids: Optional[torch.Tensor] = None, 21 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 22 | output_attentions: bool = False, 23 | use_cache: bool = False, 24 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 25 | if output_attentions: 26 | warnings.warn( 27 | "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." 28 | ) 29 | 30 | bsz, q_len, _ = hidden_states.size() 31 | 32 | query_states = ( 33 | self.q_proj(hidden_states) 34 | .view(bsz, q_len, self.num_heads, self.head_dim) 35 | .transpose(1, 2) 36 | ) 37 | key_states = ( 38 | self.k_proj(hidden_states) 39 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 40 | .transpose(1, 2) 41 | ) 42 | value_states = ( 43 | self.v_proj(hidden_states) 44 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 45 | .transpose(1, 2) 46 | ) # shape: (b, num_heads, s, head_dim) 47 | 48 | kv_seq_len = key_states.shape[-2] 49 | if past_key_value is not None: 50 | kv_seq_len += past_key_value[0].shape[-2] 51 | 52 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 53 | query_states, key_states = apply_rotary_pos_emb( 54 | query_states, key_states, cos, sin, position_ids 55 | ) 56 | 57 | if past_key_value is not None: 58 | # reuse k, v 59 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 60 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 61 | 62 | past_key_value = (key_states, value_states) if use_cache else None 63 | 64 | # repeat k/v heads if n_kv_heads < n_heads 65 | key_states = repeat_kv(key_states, self.num_key_value_groups) 66 | value_states = repeat_kv(value_states, self.num_key_value_groups) 67 | 68 | # Transform the data into the format required by flash attention 69 | qkv = torch.stack([query_states, key_states, value_states], dim=2) 70 | qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] 71 | key_padding_mask = attention_mask 72 | 73 | if key_padding_mask is None: 74 | qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim) 75 | cu_q_lens = torch.arange( 76 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 77 | ) 78 | max_s = q_len 79 | output = flash_attn_unpadded_qkvpacked_func( 80 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 81 | ) 82 | output = output.view(bsz, q_len, -1) 83 | else: 84 | qkv = qkv.reshape(bsz, q_len, -1) 85 | qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) 86 | qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) 87 | output_unpad = 
flash_attn_unpadded_qkvpacked_func( 88 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 89 | ) 90 | output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) 91 | output = pad_input(output_unpad, indices, bsz, q_len) 92 | 93 | return self.o_proj(output), None, past_key_value 94 | 95 | 96 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 97 | # requires the attention mask to be the same as the key_padding_mask 98 | def _prepare_decoder_attention_mask( 99 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 100 | ): 101 | # [bsz, seq_len] 102 | return attention_mask 103 | 104 | 105 | def replace_llama_attn_with_flash_attn(): 106 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 107 | if cuda_major < 8: 108 | warnings.warn( 109 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 110 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 111 | ) 112 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 113 | _prepare_decoder_attention_mask 114 | ) 115 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 116 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | from llava.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | replace_llama_attn_with_flash_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /llava/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from llava.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True) 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 
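Requires the OPENAI_API_KEY environment variable. Returns True if the API flags the text, and False if the request fails or the expected fields are missing from the response.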
105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.1.3" 8 | description = "Towards GPT-4 like large language and visual assistant." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "accelerate==0.21.0", "peft==0.4.0", "bitsandbytes==0.41.0", "transformers==4.31.0", 17 | "pydantic<2,>=1", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 18 | "gradio==3.35.2", "gradio_client==0.2.9", 19 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 20 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 21 | ] 22 | 23 | [project.optional-dependencies] 24 | train = ["deepspeed==0.9.5", "ninja", "wandb"] 25 | 26 | [project.urls] 27 | "Homepage" = "https://llava-vl.github.io" 28 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues" 29 | 30 | [tool.setuptools.packages.find] 31 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 32 | 33 | [tool.wheel] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | -------------------------------------------------------------------------------- /scripts/convert_docvqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['questionId'] 14 | text = res['answer'].rstrip('.') 15 | all_answers.append({"questionId": question_id, "answer": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | 20 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = 
res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 
'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, f) 57 | -------------------------------------------------------------------------------- /scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 
35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /scripts/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_80k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/finetune_full_schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_158k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 3 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 21 | --version $PROMPT_VERSION \ 22 | --data_path ./playground/data/llava_instruct_80k.json \ 23 | --image_folder /path/to/coco/train2017 \ 24 | --vision_tower openai/clip-vit-large-patch14 \ 25 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 26 | --mm_vision_select_layer -2 \ 27 | --mm_use_im_start_end False \ 28 | --mm_use_im_patch_token False \ 29 | --bf16 True \ 30 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 16 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 1 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 50000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. 
\ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --dataloader_num_workers 4 \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/finetune_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --bits 4 \ 21 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 22 | --version $PROMPT_VERSION \ 23 | --data_path ./playground/data/llava_instruct_80k.json \ 24 | --image_folder /path/to/coco/train2017 \ 25 | --vision_tower openai/clip-vit-large-patch14 \ 26 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 27 | --mm_vision_select_layer -2 \ 28 | --mm_use_im_start_end False \ 29 | --mm_use_im_patch_token False \ 30 | --bf16 True \ 31 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 32 | --num_train_epochs 1 \ 33 | --per_device_train_batch_size 16 \ 34 | --per_device_eval_batch_size 4 \ 35 | --gradient_accumulation_steps 1 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 50000 \ 39 | --save_total_limit 1 \ 40 | --learning_rate 2e-5 \ 41 | --weight_decay 0. 
\ 42 | --warmup_ratio 0.03 \ 43 | --lr_scheduler_type "cosine" \ 44 | --logging_steps 1 \ 45 | --tf32 True \ 46 | --model_max_length 2048 \ 47 | --gradient_checkpointing True \ 48 | --lazy_preprocess True \ 49 | --dataloader_num_workers 4 \ 50 | --report_to wandb 51 | -------------------------------------------------------------------------------- /scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | # MODEL_VERSION=vicuna-v1-3-7b 8 | # MODEL_VERSION=llama-2-7b-chat 9 | 10 | ########### DO NOT CHANGE ########### 11 | ########### USE THIS FOR BOTH ########### 12 | PROMPT_VERSION=plain 13 | ########### DO NOT CHANGE ########### 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 18 | --version $PROMPT_VERSION \ 19 | --data_path /path/to/pretrain_data.json \ 20 | --image_folder /path/to/images \ 21 | --vision_tower openai/clip-vit-large-patch14 \ 22 | --tune_mm_mlp_adapter True \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 16 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 1 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 24000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 2e-3 \ 37 | --weight_decay 0. 
\ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to wandb 47 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/docvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-tokenpacker-7b" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.eval_docvqa \ 12 | --model-path llava-tokenpacker-7b \ 13 | --question-file ./playground/data/eval/docvqa/data/test_v1.0.json \ 14 | --image-folder /path/to/docvqa/images \ 15 | --answers-file ./playground/data/eval/docvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode vicuna_v1 & 20 | done 21 | 22 | wait 23 | 24 | output_file=./playground/data/eval/docvqa/answers/$CKPT/merge.jsonl 25 | 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ./playground/data/eval/docvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | python scripts/convert_docvqa_for_eval.py --src $output_file --dst ./playground/data/eval/docvqa/answers/$CKPT/submit.json 35 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-tokenpacker-7b" 9 | SPLIT="llava_gqa_testdev_balanced" 10 | GQADIR="./playground/data/eval/gqa/data" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path llava-tokenpacker-7b \ 15 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 16 | --image-folder /path/to/gqa/images \ 17 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 37 | 38 | cd $GQADIR 39 | python eval/eval.py --tier testdev_balanced 40 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path llava-tokenpacker-7b \ 7 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-tokenpacker-7b.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 14 | 15 | python scripts/convert_mmbench_for_submission.py \ 16 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 17 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 19 | --experiment llava-tokenpacker-7b -------------------------------------------------------------------------------- /scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NAME=llava-tokenpacker-7b 3 | 4 | python -m llava.eval.model_vqa_loader \ 5 | --model-path llava-tokenpacker-7b \ 6 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 7 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 8 | --answers-file ./playground/data/eval/MME/answers/$NAME.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | cd ./playground/data/eval/MME 13 | 14 | python convert_answer_to_mme.py --experiment $NAME 15 | 16 | cd eval_tool 17 | 18 | python calculation.py --results_dir answers/$NAME 19 | 20 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmmu_val.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="llava-tokenpacker-7b" 10 | CONFIG="llava/eval/mmmu/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python llava/eval/mmmu/eval/run_llava.py \ 14 | --data_path /path/to/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path llava-tokenpacker-7b \ 17 | --answers-file ./playground/data/eval/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "validation" \ 21 | --conv-mode vicuna_v1 & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python llava/eval/mmmu/eval/eval.py --result_file $output_file --output_path ./playground/data/eval/MMMU/$CKPT/val.json 37 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path llava-tokenpacker-7b \ 5 | --question-file /path/to/llava-mm-vet.jsonl \ 6 | --image-folder /path/to/mm-vet/images \ 7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-tokenpacker-7b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p ./playground/data/eval/mm-vet/results 12 | 13 | python scripts/convert_mmvet_for_eval.py \ 14 | --src ./playground/data/eval/mm-vet/answers/llava-tokenpacker-7b.jsonl \ 15 | --dst ./playground/data/eval/mm-vet/results/llava-tokenpacker-7b.json 16 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/ocr_bench.sh: -------------------------------------------------------------------------------- 1 | 2 | python -m llava.eval.eval_ocr_bench \ 3 | --model_path llava-tokenpacker-7b \ 4 | --image_folder /path/to/OCR-Bench/OCRBench_Images \ 5 | --output_folder ./playground/data/eval/ocr_bench \ 6 | --OCRBench_file /path/to/OCRBench.json \ 7 | --save_name llava-tokenpacker-7b \ 8 | --temperature 0 \ 9 | --conv_mode vicuna_v1 10 | 11 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | NAME="llava-tokenpacker-7b" 5 | 6 | python -m llava.eval.model_vqa_loader_pope \ 7 | --model-path llava-tokenpacker-7b \ 8 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 9 | --image-folder /path/to/coco_imgs \ 10 | --answers-file ./playground/data/eval/pope/answers/$NAME.jsonl \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | python llava/eval/eval_pope.py \ 15 | --annotation-dir ./playground/data/eval/pope/coco \ 16 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 17 | --result-file ./playground/data/eval/pope/answers/$NAME.jsonl 18 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0 python -m llava.eval.model_vqa_loader \ 4 | --model-path llava-tokenpacker-7b \ 5 | --question-file /path/to/llava_textvqa_val_v051_ocr.jsonl \ 6 | --image-folder /path/to/textvqa/train_images \ 7 | --answers-file ./playground/data/eval/textvqa/answers/llava-tokenpacker-7b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | CUDA_VISIBLE_DEVICES=0 python -m llava.eval.eval_textvqa \ 12 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 13 | --result-file ./playground/data/eval/textvqa/answers/llava-tokenpacker-7b.jsonl 14 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path llava-tokenpacker-7b \ 5 | --question-file 
./playground/data/eval/vizwiz/llava_test.jsonl \ 6 | --image-folder /path/to/vizwiz/test \ 7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-tokenpacker-7b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python scripts/convert_vizwiz_for_submission.py \ 12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 13 | --result-file ./playground/data/eval/vizwiz/answers/llava-tokenpacker-7b.jsonl \ 14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-tokenpacker-7b.json 15 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-tokenpacker-7b" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path llava-tokenpacker-7b \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder /path/to/VQAv2/test2015/ \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 36 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path vicuna-7b-v1.5 \ 6 | --version v1 \ 7 | --data_path /path/to/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./data/llava_mix665k \ 9 | --vision_tower ./clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-tokenpacker-pretrain/mm_projector.bin \ 11 | --mm_projector_type tokenpacker \ 12 | --scale_factor 2 \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --image_aspect_ratio pad \ 17 | --group_by_modality_length True \ 18 | --bf16 True \ 19 | --output_dir ./checkpoints/llava-tokenpacker \ 20 | --num_train_epochs 1 \ 21 | --per_device_train_batch_size 16 \ 22 | --per_device_eval_batch_size 4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --evaluation_strategy "no" \ 25 | --save_strategy "steps" \ 26 | --save_steps 50000 \ 27 | --save_total_limit 1 \ 28 | --learning_rate 2e-5 \ 29 | --weight_decay 0. 
\ 30 | --warmup_ratio 0.03 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --tf32 True \ 34 | --model_max_length 2048 \ 35 | --gradient_checkpointing True \ 36 | --dataloader_num_workers 4 \ 37 | --lazy_preprocess True \ 38 | --report_to "none" 39 | 40 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_hd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path vicuna-7b-v1.5 \ 6 | --version v1 \ 7 | --data_path /path/to/mgm_instruction.json \ 8 | --image_folder ./data/MGM-Finetune \ 9 | --vision_tower ./clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-tokenpacker-pretrain-hd/mm_projector.bin \ 11 | --mm_projector_type tokenpacker \ 12 | --patch_num 9 \ 13 | --scale_factor 2 \ 14 | --mm_vision_select_layer -2 \ 15 | --mm_use_im_start_end False \ 16 | --mm_use_im_patch_token False \ 17 | --image_aspect_ratio slice \ 18 | --group_by_modality_length True \ 19 | --bf16 True \ 20 | --output_dir ./checkpoints/llava-tokenpacker-hd \ 21 | --num_train_epochs 1 \ 22 | --per_device_train_batch_size 16 \ 23 | --per_device_eval_batch_size 4 \ 24 | --gradient_accumulation_steps 1 \ 25 | --evaluation_strategy "no" \ 26 | --save_strategy "steps" \ 27 | --save_steps 50000 \ 28 | --save_total_limit 1 \ 29 | --learning_rate 2e-5 \ 30 | --weight_decay 0. \ 31 | --warmup_ratio 0.03 \ 32 | --lr_scheduler_type "cosine" \ 33 | --logging_steps 1 \ 34 | --tf32 True \ 35 | --model_max_length 2048 \ 36 | --gradient_checkpointing True \ 37 | --dataloader_num_workers 4 \ 38 | --lazy_preprocess True \ 39 | --report_to "none" 40 | 41 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path vicuna-7b-v1.5 \ 6 | --version plain \ 7 | --data_path /path/to/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./data/llava_pretrain_558k \ 9 | --vision_tower ./clip-vit-large-patch14-336 \ 10 | --mm_projector_type tokenpacker \ 11 | --scale_factor 2 \ 12 | --tune_mm_mlp_adapter True \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-tokenpacker-pretrain/ \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 32 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 24000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 1e-3 \ 27 | --weight_decay 0. 
\ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to "none" 37 | 38 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain_hd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path vicuna-7b-v1.5 \ 6 | --version plain \ 7 | --data_path /path/to/mgm_pretrain.json \ 8 | --image_folder ./data/llava_pretrain_558k \ 9 | --vision_tower ./clip-vit-large-patch14-336 \ 10 | --mm_projector_type tokenpacker \ 11 | --patch_num 9 \ 12 | --scale_factor 2 \ 13 | --tune_mm_mlp_adapter True \ 14 | --mm_vision_select_layer -2 \ 15 | --mm_use_im_start_end False \ 16 | --mm_use_im_patch_token False \ 17 | --image_aspect_ratio slice \ 18 | --bf16 True \ 19 | --output_dir ./checkpoints/llava-tokenpacker-pretrain-hd/ \ 20 | --num_train_epochs 1 \ 21 | --per_device_train_batch_size 32 \ 22 | --per_device_eval_batch_size 4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --evaluation_strategy "no" \ 25 | --save_strategy "steps" \ 26 | --save_steps 24000 \ 27 | --save_total_limit 1 \ 28 | --learning_rate 1e-3 \ 29 | --weight_decay 0. \ 30 | --warmup_ratio 0.03 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --tf32 True \ 34 | --model_max_length 2048 \ 35 | --gradient_checkpointing True \ 36 | --dataloader_num_workers 4 \ 37 | --lazy_preprocess True \ 38 | --report_to "none" 39 | 40 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /scripts/zero3_offload.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } --------------------------------------------------------------------------------
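The three DeepSpeed configs above differ mainly in how aggressively they shard training state: zero2.json partitions gradients and optimizer states (ZeRO stage 2), zero3.json additionally partitions the model parameters (stage 3), and zero3_offload.json further moves optimizer states and parameters to CPU memory and carries its own AdamW/WarmupLR blocks. The training scripts pick one of them through the --deepspeed flag. The following is a minimal sketch, not a file in this repository, for printing which stage and offload devices each config requests before launching a run; it only assumes the configs sit under scripts/ as in the layout above.

# Illustrative helper (not part of the repo): summarize the ZeRO settings of the
# DeepSpeed JSON configs shipped in scripts/.
import json

def summarize_ds_config(path):
    """Return the ZeRO stage and any CPU/NVMe offload targets declared in a config."""
    with open(path) as f:
        cfg = json.load(f)
    zero = cfg.get("zero_optimization", {})
    return {
        "stage": zero.get("stage"),
        "optimizer_offload": zero.get("offload_optimizer", {}).get("device", "none"),
        "param_offload": zero.get("offload_param", {}).get("device", "none"),
    }

if __name__ == "__main__":
    for name in ("zero2.json", "zero3.json", "zero3_offload.json"):
        print(name, summarize_ds_config(f"scripts/{name}"))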