├── .gitignore ├── LICENSE ├── README.md ├── deploy.sh ├── docs ├── 008.jpg └── Evaluation.md ├── pyproject.toml ├── qh360_vl ├── 360vl.PNG ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── compute_precision.py │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── infer.py │ ├── m4c_evaluator.py │ ├── model_vqa.py │ ├── model_vqa_loader_llama3.py │ ├── model_vqa_loader_llama3_nodist.py │ ├── model_vqa_loader_raw.py │ ├── model_vqa_mmbench_llama3.py │ ├── model_vqa_mme_llama3.py │ ├── model_vqa_mmmu.py │ ├── model_vqa_pope_llama3.py │ ├── model_vqa_refcoco_llama3.py │ ├── model_vqa_textvqa_llama3.py │ └── summarize_gpt_review.py ├── mm_utils.py ├── model │ ├── QH360_VL_arch_cc.py │ ├── __init__.py │ ├── builder.py │ ├── language_model │ │ └── QH360_VL_llama.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ ├── builder.py │ │ ├── configuration_honeybee.py │ │ ├── pipeline │ │ │ ├── config.py │ │ │ ├── data_utils │ │ │ │ ├── special_tokens.py │ │ │ │ └── utils.py │ │ │ └── interface.py │ │ └── projectors.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ └── test_message.py └── utils.py └── scripts ├── convert_gqa_for_eval.py ├── convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_seed_for_submission.py ├── convert_sqa_to_llava.py ├── convert_sqa_to_llava_base_prompt.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── eval ├── custom_vqa.sh ├── gqa.sh ├── infer.sh ├── llavabench.sh ├── mmb_cn.sh ├── mmb_en.sh ├── mme.sh ├── mmmu.sh ├── pope.sh ├── refcoco.sh ├── textvqa.sh └── vqav2.sh └── extract_mm_projector.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | *.pyc 4 | *.egg-info 5 | dist 6 | 7 | # Log 8 | *.log 9 | *.log.* 10 | *.json 11 | *.jsonl 12 | 13 | # Data 14 | !**/alpaca-data-conversation.json 15 | 16 | # Editor 17 | .idea 18 | *.swp 19 | 20 | # Other 21 | .DS_Store 22 | wandb 23 | output 24 | 25 | # Dir 26 | .ipynb_checkpoints/ 27 | __pycache__/ 28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 360VL 2 | 3 |
6 | 7 | **360VL** is developed based on the LLama3 language model and is also the industry's first open source large multi-modal model based on **LLama3-70B**[[🤗Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)]. In addition to applying the Llama3 language model, the 360VL model also designs a globally aware multi-branch projector architecture, which enables the model to have more sufficient image understanding capabilities. 8 | 9 | 10 | ## Contents 11 | - [Install](#install) 12 | - [Model Zoo](#llava-weights) 13 | - [Demo](#Demo) 14 | - [Evaluation](#evaluation) 15 | 16 | ## Install 17 | 18 | 1. Clone this repository and navigate to 360VL folder 19 | ```bash 20 | git clone https://github.com/360CVGroup/360VL.git 21 | cd 360VL 22 | ``` 23 | 24 | 2. Install Package 25 | ```Shell 26 | conda create -n qh360_vl python=3.10 -y 27 | conda activate qh360_vl 28 | bash deploy.sh 29 | ``` 30 | 31 | ## Model Zoo 32 | | Model | Checkpoints | MMBT | MMBD|MMB-CNT | MMB-CND|MMMUV|MMMUT| MME | 33 | |:--------------------|:------------:|:----:|:------:|:------:|:-------:|:-------:|:-------:|:-------:| 34 | | QWen-VL-Chat | [🤗LINK](https://huggingface.co/Qwen/Qwen-VL-Chat) | 61.8 | 60.6 | 56.3 | 56.7 |37| 32.9 | 1860 | 35 | | mPLUG-Owl2 | [🤖LINK](https://www.modelscope.cn/models/iic/mPLUG-Owl2/summary) | 66.0 | 66.5 | 60.3 | 59.5 |34.7| 32.1 | 1786.4 | 36 | | CogVLM | [🤗LINK](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf) | 65.8| 63.7 | 55.9 | 53.8 |37.3| 30.1 | 1736.6| 37 | | Monkey-Chat | [🤗LINK](https://huggingface.co/echo840/Monkey-Chat) | 72.4| 71 | 67.5 | 65.8 |40.7| - | 1887.4| 38 | | MM1-7B-Chat | [LINK](https://ar5iv.labs.arxiv.org/html/2403.09611) | -| 72.3 | - | - |37.0| 35.6 | 1858.2| 39 | | IDEFICS2-8B | [🤗LINK](https://huggingface.co/HuggingFaceM4/idefics2-8b) | 75.7 | 75.3 | 68.6 | 67.3 |43.0| 37.7 |1847.6| 40 | | SVIT-v1.5-13B| [🤗LINK](https://huggingface.co/Isaachhe/svit-v1.5-13b-full) | 69.1 | - | 63.1 | - | 38.0| 33.3|1889| 41 | | LLaVA-v1.5-13B | [🤗LINK](https://huggingface.co/liuhaotian/llava-v1.5-13b) | 69.2 | 69.2 | 65 | 63.6 |36.4| 33.6 | 1826.7| 42 | | LLaVA-v1.6-13B | [🤗LINK](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-13b) | 70 | 70.7 | 68.5 | 64.3 |36.2| - |1901| 43 | | Honeybee | [LINK](https://github.com/kakaobrain/honeybee) | 73.6 | 74.3 | - | - |36.2| -|1976.5| 44 | | YI-VL-34B | [🤗LINK](https://huggingface.co/01-ai/Yi-VL-34B) | 72.4 | 71.1 | 70.7 | 71.4 |45.1| 41.6 |2050.2| 45 | | **360VL-8B** | [🤗LINK](https://huggingface.co/qihoo360/360VL-8B) | 75.3 | 73.7 | 71.1 | 68.6 |39.7| 37.1 | 1944.6| 46 | | **360VL-70B** | [🤗LINK](https://huggingface.co/qihoo360/360VL-70B) | 78.1 | 80.4 | 76.9 | 77.7 |50.8| 44.3 | 2012.3| 47 | 48 | 49 | ## Quick Start 🤗 50 | 51 | ```Shell 52 | from transformers import AutoModelForCausalLM, AutoTokenizer 53 | import torch 54 | from PIL import Image 55 | 56 | checkpoint = "qihoo360/360VL-70B" 57 | 58 | model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16, device_map='auto', trust_remote_code=True).eval() 59 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True) 60 | vision_tower = model.get_vision_tower() 61 | vision_tower.load_model() 62 | vision_tower.to(device="cuda", dtype=torch.float16) 63 | image_processor = vision_tower.image_processor 64 | tokenizer.pad_token = tokenizer.eos_token 65 | 66 | 67 | image = Image.open("docs/008.jpg").convert('RGB') 68 | query = "Who is this cartoon character?" 
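# `docs/008.jpg` and the query above are only an example prompt; any local RGB image
# and question can be substituted. The `terminators` list below collects the token id of
# Llama 3's "<|eot_id|>" end-of-turn marker so that generation stops once the assistant
# finishes its reply.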
69 | terminators = [ 70 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 71 | ] 72 | 73 | inputs = model.build_conversation_input_ids(tokenizer, query=query, image=image, image_processor=image_processor) 74 | 75 | input_ids = inputs["input_ids"].to(device='cuda', non_blocking=True) 76 | images = inputs["image"].to(dtype=torch.float16, device='cuda', non_blocking=True) 77 | 78 | output_ids = model.generate( 79 | input_ids, 80 | images=images, 81 | do_sample=False, 82 | eos_token_id=terminators, 83 | num_beams=1, 84 | max_new_tokens=512, 85 | use_cache=True) 86 | 87 | input_token_len = input_ids.shape[1] 88 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 89 | outputs = outputs.strip() 90 | print(outputs) 91 | ``` 92 | 93 | ## Demo 94 | 95 | To run our demo, you need to download the weights of 360VL[🤗LINK](https://huggingface.co/qihoo360/360VL-70B) and the weights of CLIP-ViT-336[🤗LINK](https://huggingface.co/openai/clip-vit-large-patch14-336) 96 | 97 | ### Gradio Web UI 98 | 99 | To launch a Gradio demo locally, please run the following commands one by one. If you plan to launch multiple model workers to compare between different checkpoints, you only need to launch the controller and the web server *ONCE*. 100 | 101 | #### Launch a controller 102 | ```Shell 103 | python -m qh360_vl.serve.controller --host 0.0.0.0 --port 10000 104 | ``` 105 | 106 | #### Launch a gradio web server. 107 | ```Shell 108 | python -m qh360_vl.serve.gradio_web_server --controller http://localhost:10000 --model-list-mode reload 109 | ``` 110 | You just launched the Gradio web interface. Now, you can open the web interface with the URL printed on the screen. You may notice that there is no model in the model list. Do not worry, as we have not launched any model worker yet. It will be automatically updated when you launch a model worker. 111 | 112 | #### Launch a model worker 113 | 114 | This is the actual *worker* that performs the inference on the GPU. Each worker is responsible for a single model specified in `--model-path`. 115 | 116 | Note that the 8B model supports single-card inference, but the 70B model requires 8-card inference. 117 | 118 | ```Shell 119 | CUDA_VISIBLE_DEVICES=0 python -m qh360_vl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path qihoo360/360VL-8B 120 | ``` 121 | 122 | ```Shell 123 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m qh360_vl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path qihoo360/360VL-70B 124 | ``` 125 | 126 | ### CLI Inference 127 | 128 | Chat about images using 360VL without the need of Gradio interface. 129 | 130 | ```Shell 131 | INIT_MODEL_PATH="/hbox2dir" 132 | name="360VL-8B" 133 | python -m qh360_vl.eval.infer \ 134 | --model-path $INIT_MODEL_PATH/$name \ 135 | ``` 136 | 137 | 138 | ### Download Llama3 checkpoints (Non-essential) 139 | 140 | 360VL is developed based on Llama 3. If you have needs, please download the weights yourself. 141 | 142 | [[🤗Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)] 143 | [[🤗Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)] 144 | 145 | ## Evaluation 146 | We refer to the evaluation data organization method of LLava-1.5, which can be found in the following documents. 
[Evaluation.md](docs/Evaluation.md) 147 | 148 | ```Shell 149 | bash scripts/eval/mme.sh 150 | bash scripts/eval/mmb_cn.sh 151 | bash scripts/eval/mmb_en.sh 152 | bash scripts/eval/refcoco.sh 153 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ./scripts/eval/gqa.sh 154 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ./scripts/eval/vqav2.sh 155 | bash scripts/eval/llavabench.sh 156 | bash scripts/eval/mmmu.sh 157 | bash scripts/eval/pope.sh 158 | bash scripts/eval/textvqa.sh 159 | ``` 160 | 161 | 162 | 163 | ## License 164 | 165 | This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. 166 | The content of this project itself is licensed under the [Apache license 2.0](./LICENSE). 167 | 168 | ## Related Projects 169 | This work wouldn't be possible without the incredible open-source code of these projects. Huge thanks! 170 | - [Meta Llama 3](https://github.com/meta-llama/llama3) 171 | - [LLaVA: Large Language and Vision Assistant](https://github.com/haotian-liu/LLaVA) 172 | - [Honeybee: Locality-enhanced Projector for Multimodal LLM](https://github.com/kakaobrain/honeybee) 173 | 174 | 175 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | # pip config set global.index-url http://mirrors.cloud.tencent.com/pypi/simple 2 | # pip config set global.trusted-host mirrors.cloud.tencent.com 3 | pip install --upgrade pip # enable PEP 660 support 4 | 5 | pip install -e . 6 | 7 | pip install ninja 8 | pip install flash-attn --no-build-isolation 9 | -------------------------------------------------------------------------------- /docs/008.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360CVGroup/360VL/ad6a11c15d41cfea2fe487e0d2c88feb138546af/docs/008.jpg -------------------------------------------------------------------------------- /docs/Evaluation.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | 3 | In LLaVA-1.5, we evaluate models on a diverse set of 12 benchmarks. To ensure the reproducibility, we evaluate the models with greedy decoding. We do not evaluate using beam search to make the inference process consistent with the chat demo of real-time outputs. 4 | 5 | Currently, we mostly utilize the official toolkit or server for the evaluation. 6 | 7 | ## Evaluate on Custom Datasets 8 | 9 | You can evaluate LLaVA on your custom datasets by converting your dataset to LLaVA's jsonl format, and evaluate using [`model_vqa.py`](https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/model_vqa.py). 10 | 11 | Below we provide a general guideline for evaluating datasets with some common formats. 12 | 13 | 1. Short-answer (e.g. VQAv2, MME). 14 | 15 | ``` 16 | 17 | Answer the question using a single word or phrase. 18 | ``` 19 | 20 | 2. Option-only for multiple-choice (e.g. MMBench, SEED-Bench). 21 | 22 | ``` 23 | 24 | A. 25 | B. 26 | C. 27 | D. 28 | Answer with the option's letter from the given choices directly. 29 | ``` 30 | 31 | 3. Natural QA (e.g. LLaVA-Bench, MM-Vet). 32 | 33 | No postprocessing is needed. 34 | 35 | ## Scripts 36 | 37 | Before preparing task-specific data, **you MUST first download [eval.zip](https://drive.google.com/file/d/1atZSBBrAX54yYpxtVVW33zFvcnaHeFPy/view?usp=sharing)**. 
It contains custom annotations, scripts, and the prediction files with LLaVA v1.5. Extract to `./playground/data/eval`. This also provides a general structure for all datasets. 38 | 39 | ### VQAv2 40 | 41 | 1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and put it under `./playground/data/eval/vqav2`. 42 | 2. Multi-GPU inference. 43 | ```Shell 44 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/vqav2.sh 45 | ``` 46 | 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `./playground/data/eval/vqav2/answers_upload`. 47 | 48 | ### GQA 49 | 50 | 1. Download the [data](https://cs.stanford.edu/people/dorarad/gqa/download.html) and [evaluation scripts](https://cs.stanford.edu/people/dorarad/gqa/evaluate.html) following the official instructions and put under `./playground/data/eval/gqa/data`. You may need to modify `eval.py` as [this](https://gist.github.com/haotian-liu/db6eddc2a984b4cbcc8a7f26fd523187) due to the missing assets in the GQA v1.2 release. 51 | 2. Multi-GPU inference. 52 | ```Shell 53 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/gqa.sh 54 | ``` 55 | 56 | ### VisWiz 57 | 58 | 1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `./playground/data/eval/vizwiz`. 59 | 2. Single-GPU inference. 60 | ```Shell 61 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/vizwiz.sh 62 | ``` 63 | 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/2185/my-submission): `./playground/data/eval/vizwiz/answers_upload`. 64 | 65 | ### ScienceQA 66 | 67 | 1. Under `./playground/data/eval/scienceqa`, download `images`, `pid_splits.json`, `problems.json` from the `data/scienceqa` folder of the ScienceQA [repo](https://github.com/lupantech/ScienceQA). 68 | 2. Single-GPU inference and evaluate. 69 | ```Shell 70 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/sqa.sh 71 | ``` 72 | 73 | ### TextVQA 74 | 75 | 1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and [images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and extract to `./playground/data/eval/textvqa`. 76 | 2. Single-GPU inference and evaluate. 77 | ```Shell 78 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/textvqa.sh 79 | ``` 80 | 81 | ### POPE 82 | 83 | 1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put under `./playground/data/eval/pope`. 84 | 2. Single-GPU inference and evaluate. 85 | ```Shell 86 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/pope.sh 87 | ``` 88 | 89 | ### MME 90 | 91 | 1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation). 92 | 2. Downloaded images to `MME_Benchmark_release_version`. 93 | 3. put the official `eval_tool` and `MME_Benchmark_release_version` under `./playground/data/eval/MME`. 94 | 4. Single-GPU inference and evaluate. 95 | ```Shell 96 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mme.sh 97 | ``` 98 | 99 | ### MMBench 100 | 101 | 1. Download [`mmbench_dev_20230712.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put under `./playground/data/eval/mmbench`. 102 | 2. Single-GPU inference. 
103 | ```Shell 104 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench.sh 105 | ``` 106 | 3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `./playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712`. 107 | 108 | ### MMBench-CN 109 | 110 | 1. Download [`mmbench_dev_cn_20231003.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_cn_20231003.tsv) and put under `./playground/data/eval/mmbench`. 111 | 2. Single-GPU inference. 112 | ```Shell 113 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench_cn.sh 114 | ``` 115 | 3. Submit the results to the evaluation server: `./playground/data/eval/mmbench/answers_upload/mmbench_dev_cn_20231003`. 116 | 117 | 118 | ### SEED-Bench 119 | 120 | 1. Following the official [instructions](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md) to download the images and the videos. Put images under `./playground/data/eval/seed_bench/SEED-Bench-image`. 121 | 2. Extract the video frame in the middle from the downloaded videos, and put them under `./playground/data/eval/seed_bench/SEED-Bench-video-image`. We provide our script `extract_video_frames.py` modified from the official one. 122 | 3. Multiple-GPU inference and evaluate. 123 | ```Shell 124 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/seed.sh 125 | ``` 126 | 4. Optionally, submit the results to the leaderboard: `./playground/data/eval/seed_bench/answers_upload` using the official jupyter notebook. 127 | 128 | ### LLaVA-Bench-in-the-Wild 129 | 130 | 1. Extract contents of [`llava-bench-in-the-wild`](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) to `./playground/data/eval/llava-bench-in-the-wild`. 131 | 2. Single-GPU inference and evaluate. 132 | ```Shell 133 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/llavabench.sh 134 | ``` 135 | 136 | ### MM-Vet 137 | 138 | 1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `./playground/data/eval/mmvet`. 139 | 2. Single-GPU inference. 140 | ```Shell 141 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh 142 | ``` 143 | 3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official jupyter notebook. 144 | 145 | ## More Benchmarks 146 | 147 | Below are awesome benchmarks for multimodal understanding from the research community, that are not initially included in the LLaVA-1.5 release. 148 | 149 | ### Q-Bench 150 | 151 | 1. Download [`llvisionqa_dev.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/llvisionqa_dev.json) (for `dev`-subset) and [`llvisionqa_test.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/llvisionqa_test.json) (for `test`-subset). Put them under `./playground/data/eval/qbench`. 152 | 2. Download and extract [images](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/images_llvisionqa.tar) and put all the images directly under `./playground/data/eval/qbench/images_llviqionqa`. 153 | 3. Single-GPU inference (change `dev` to `test` for evaluation on test set). 154 | ```Shell 155 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/qbench.sh dev 156 | ``` 157 | 4. Submit the results by instruction [here](https://github.com/VQAssessment/Q-Bench#option-1-submit-results): `./playground/data/eval/qbench/llvisionqa_dev_answers.jsonl`. 158 | 159 | ### Chinese-Q-Bench 160 | 161 | 1. 
Download [`质衡-问答-验证集.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/%E8%B4%A8%E8%A1%A1-%E9%97%AE%E7%AD%94-%E9%AA%8C%E8%AF%81%E9%9B%86.json) (for `dev`-subset) and [`质衡-问答-测试集.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/%E8%B4%A8%E8%A1%A1-%E9%97%AE%E7%AD%94-%E6%B5%8B%E8%AF%95%E9%9B%86.json) (for `test`-subset). Put them under `./playground/data/eval/qbench`. 162 | 2. Download and extract [images](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/images_llvisionqa.tar) and put all the images directly under `./playground/data/eval/qbench/images_llviqionqa`. 163 | 3. Single-GPU inference (change `dev` to `test` for evaluation on test set). 164 | ```Shell 165 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/qbench_zh.sh dev 166 | ``` 167 | 4. Submit the results by instruction [here](https://github.com/VQAssessment/Q-Bench#option-1-submit-results): `./playground/data/eval/qbench/llvisionqa_zh_dev_answers.jsonl`. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "qh360_vl" 7 | version = "1.0.0" 8 | description = "Towards GPT-4 like large language and visual assistant." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "einops", "fastapi", "gradio==3.35.2", "markdown2[all]", "numpy", 17 | "requests", "sentencepiece", "tokenizers>=0.12.1", 18 | "uvicorn", "wandb", 19 | "shortuuid", "httpx==0.24.0", 20 | "deepspeed==0.9.5", 21 | "peft==0.4.0", 22 | "transformers==4.37.2", 23 | "accelerate==0.29.3", 24 | "bitsandbytes==0.41.0", 25 | "scikit-learn==1.2.2", 26 | "sentencepiece==0.1.99", 27 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 28 | "gradio_client==0.2.9" 29 | ] 30 | 31 | [project.urls] 32 | "Homepage" = "https://github.com/360CVGroup/360VL" 33 | "Bug Tracker" = "https://github.com/360CVGroup/360VL/issues" 34 | 35 | [tool.setuptools.packages.find] 36 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 37 | 38 | [tool.wheel] 39 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] -------------------------------------------------------------------------------- /qh360_vl/360vl.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360CVGroup/360VL/ad6a11c15d41cfea2fe487e0d2c88feb138546af/qh360_vl/360vl.PNG -------------------------------------------------------------------------------- /qh360_vl/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import QH360_VL_LlamaForCausalLM 2 | -------------------------------------------------------------------------------- /qh360_vl/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
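# Log output directory used by the serve components (controller / Gradio web server /
# model worker); "." keeps the logs in the current working directory.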
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /qh360_vl/eval/compute_precision.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | 4 | 5 | def ori_bbox(nor_bbox, img_size): 6 | w = img_size[0] 7 | h = img_size[1] 8 | if w>h: 9 | big = w 10 | border_w = 0 11 | border_h = (w-h)//2 12 | elif w= right_line or top_line >= bottom_line: 49 | return 0 50 | else: 51 | intersect = (right_line - left_line) * (bottom_line - top_line) 52 | return intersect / (sum_area - intersect) * 1.0 53 | 54 | 55 | gt_info = {} 56 | with open('./playground/data/eval/refcoco/REFCOCO_VAL_en_new.jsonl', "r") as f: 57 | for line in tqdm(f): 58 | info = json.loads(line) 59 | gt_info[info['sent_id']] = {'bbox': info['bbox'], 'height': info['height'], 'width': info['width']} 60 | 61 | 62 | import sys 63 | with open(sys.argv[1], "r") as f: 64 | iou_thresh = 0.5 65 | tp = 0 66 | fp = 0 67 | for line in tqdm(f): 68 | info = json.loads(line) 69 | idx = info['question_id'] 70 | pred = info['text'] 71 | try: 72 | gt = gt_info[idx] 73 | gt_bbox = gt['bbox'] 74 | # print('gt:',gt_bbox) 75 | 76 | pred_bboxs = pred.split('; ') 77 | num_bboxs = len(pred_bboxs) 78 | for i, pred_bbox in enumerate(pred_bboxs): 79 | pred_bbox = eval(pred_bbox) 80 | 81 | pred_bbox = ori_bbox(pred_bbox, [gt['width'], gt['height']]) 82 | # print('pred:',pred_bbox,'gt:',gt_bbox) 83 | 84 | iou = compute_iou(pred_bbox, gt_bbox) 85 | if iou >= iou_thresh: 86 | tp += 1 87 | break 88 | else: 89 | if i == num_bboxs - 1: 90 | fp += 1 91 | except: 92 | print(pred) 93 | fp += 1 94 | precision = tp / (tp + fp) 95 | print(f'==== REC RESULT: precision = {precision}, tp = {tp}, fp = {fp}') 96 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the 
answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | 86 | if isinstance(inst['caption'], list): 87 | cap_str = '\n'.join(inst['caption']) 88 | else: 89 | cap_str = inst['caption'] 90 | 91 | category = 'llava_bench_' + json.loads(ques_js)['category'] 92 | if category in rule_dict: 93 | rule = rule_dict[category] 94 | else: 95 | assert False, f"Visual QA category not found in rule file: {category}." 
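        # Assemble the GPT-4 review prompt: the image captions as [Context], the question,
        # the two candidate answers, and the grading instructions taken from the rule file.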
96 | prompt = rule['prompt'] 97 | role = rule['role'] 98 | content = (f'[Context]\n{cap_str}\n\n' 99 | f'[Question]\n{ques["text"]}\n\n' 100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 102 | f'[System]\n{prompt}\n\n') 103 | cur_js = { 104 | 'id': idx+1, 105 | 'question_id': ques['question_id'], 106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 108 | 'category': category 109 | } 110 | if idx >= len(cur_reviews): 111 | review = get_eval(content, args.max_tokens) 112 | scores = parse_score(review) 113 | cur_js['content'] = review 114 | cur_js['tuple'] = scores 115 | review_file.write(json.dumps(cur_js) + '\n') 116 | review_file.flush() 117 | else: 118 | print(f'Skipping {idx} as we already have it.') 119 | idx += 1 120 | print(idx) 121 | review_file.close() 122 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, 
ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | return f1 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("--annotation-dir", type=str) 
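    # --annotation-dir is the folder of coco_pope_*.json label files iterated over below;
    # the --question-file and --result-file arguments that follow take the benchmark
    # questions and the model's jsonl predictions.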
68 | parser.add_argument("--question-file", type=str) 69 | parser.add_argument("--result-file", type=str) 70 | args = parser.parse_args() 71 | 72 | questions = [json.loads(line) for line in open(args.question_file)] 73 | questions = {question['question_id']: question for question in questions} 74 | answers = [json.loads(q) for q in open(args.result_file)] 75 | 76 | f1s = [] 77 | for file in os.listdir(args.annotation_dir): 78 | assert file.startswith('coco_pope_') 79 | assert file.endswith('.json') 80 | category = file[10:-5] 81 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 82 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 83 | f1 = eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 84 | f1s.append(f1) 85 | print("====================================") 86 | print("f1 mean:", sum(f1s) / len(f1s)) 87 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_science_qa_gpt4_requery.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--requery-result', type=str) 14 | parser.add_argument('--our-result', type=str) 15 | parser.add_argument('--output-result', type=str) 16 | parser.add_argument('--split', type=str, default='test') 17 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 18 | return parser.parse_args() 19 | 
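# convert_caps below reshapes prediction records into COCO-caption-style entries
# ({"image_id": ..., "caption": ...}); the same helper appears in the other ScienceQA
# evaluation scripts in this folder.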
20 | 21 | def convert_caps(results): 22 | fakecaps = [] 23 | for result in results: 24 | image_id = result['question_id'] 25 | caption = result['text'] 26 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 27 | return fakecaps 28 | 29 | 30 | def get_pred_idx(prediction, choices, options): 31 | """ 32 | Get the index (e.g. 2) from the prediction (e.g. 'C') 33 | """ 34 | if prediction in options[:len(choices)]: 35 | return options.index(prediction) 36 | else: 37 | return random.choice(range(len(choices))) 38 | 39 | 40 | if __name__ == "__main__": 41 | args = get_args() 42 | 43 | base_dir = args.base_dir 44 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 45 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 46 | our_predictions = [json.loads(line) for line in open(args.our_result)] 47 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 48 | split_problems = {idx: problems[idx] for idx in split_indices} 49 | 50 | requery_predictions = [json.loads(line) for line in open(args.requery_result)] 51 | requery_predictions = {pred['question_id']: pred for pred in requery_predictions} 52 | 53 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 54 | 55 | results = defaultdict(lambda: 0) 56 | 57 | sqa_results = {} 58 | sqa_results['acc'] = None 59 | sqa_results['correct'] = None 60 | sqa_results['count'] = None 61 | sqa_results['results'] = {} 62 | sqa_results['outputs'] = {} 63 | 64 | for prob_id, prob in split_problems.items(): 65 | if prob_id not in our_predictions: 66 | assert False 67 | if prob_id not in gpt4_predictions: 68 | assert False 69 | our_pred = our_predictions[prob_id]['text'] 70 | gpt4_pred = gpt4_predictions[prob_id] 71 | if prob_id not in requery_predictions: 72 | results['missing_requery'] += 1 73 | requery_pred = "MISSING" 74 | else: 75 | requery_pred = requery_predictions[prob_id]['text'] 76 | 77 | pattern = re.compile(r'The answer is ([A-Z]).') 78 | our_res = pattern.findall(our_pred) 79 | if len(our_res) == 1: 80 | our_answer = our_res[0] # 'A', 'B', ... 81 | else: 82 | our_answer = "FAILED" 83 | 84 | requery_res = pattern.findall(requery_pred) 85 | if len(requery_res) == 1: 86 | requery_answer = requery_res[0] # 'A', 'B', ... 87 | else: 88 | requery_answer = "FAILED" 89 | 90 | gpt4_res = pattern.findall(gpt4_pred) 91 | if len(gpt4_res) == 1: 92 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 
93 | else: 94 | gpt4_answer = "FAILED" 95 | 96 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 97 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 98 | requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options) 99 | 100 | results['total'] += 1 101 | 102 | if gpt4_answer == 'FAILED': 103 | results['gpt4_failed'] += 1 104 | if gpt4_pred_idx == prob['answer']: 105 | results['gpt4_correct'] += 1 106 | if our_pred_idx == prob['answer']: 107 | results['gpt4_ourvisual_correct'] += 1 108 | elif gpt4_pred_idx == prob['answer']: 109 | results['gpt4_correct'] += 1 110 | results['gpt4_ourvisual_correct'] += 1 111 | 112 | if our_pred_idx == prob['answer']: 113 | results['our_correct'] += 1 114 | 115 | if requery_answer == 'FAILED': 116 | sqa_results['results'][prob_id] = our_pred_idx 117 | if our_pred_idx == prob['answer']: 118 | results['requery_correct'] += 1 119 | else: 120 | sqa_results['results'][prob_id] = requery_pred_idx 121 | if requery_pred_idx == prob['answer']: 122 | results['requery_correct'] += 1 123 | else: 124 | print(f""" 125 | Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']} 126 | Our ({our_answer}): {our_pred} 127 | GPT-4 ({gpt4_answer}): {gpt4_pred} 128 | Requery ({requery_answer}): {requery_pred} 129 | print("=====================================") 130 | """) 131 | 132 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 133 | results['correct_upperbound'] += 1 134 | 135 | total = results['total'] 136 | print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {results["our_correct"] / total * 100:.2f}%') 137 | print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {results["gpt4_correct"] / total * 100:.2f}%') 138 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 139 | print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%') 140 | print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%') 141 | print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 142 | 143 | sqa_results['acc'] = results["requery_correct"] / total * 100 144 | sqa_results['correct'] = results["requery_correct"] 145 | sqa_results['count'] = total 146 | 147 | with open(args.output_result, 'w') as f: 148 | json.dump(sqa_results, f, indent=2) 149 | 150 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from qh360_vl.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | print(result['text'],'<======================>',annotation['answers']) 50 | 51 | evaluator = TextVQAAccuracyEvaluator() 52 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 53 | 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | 58 | if args.result_file is not None: 59 | eval_single(args.annotation_file, args.result_file) 60 | 61 | if args.result_dir is not None: 62 | for result_file in sorted(os.listdir(args.result_dir)): 63 | if not result_file.endswith('.jsonl'): 64 | print(f'Skipping {result_file}') 65 | continue 66 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 67 | -------------------------------------------------------------------------------- /qh360_vl/eval/infer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | import pdb 19 | import sys 20 | from pprint import pprint as pp 21 | 22 | g_input_msg = [ 23 | { 24 | "role": "system", 25 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
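        # g_input_msg holds the running chat history in Llama 3 chat format; get_input()
        # appends each user turn and infer_model() appends each assistant reply, so the
        # CLI keeps multi-round context.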
26 | } 27 | ] 28 | 29 | 30 | def get_input(tokenizer, image_processor, model_config, rounds, query, args): 31 | g_input_msg.append({ 32 | "role": "user", 33 | "content": ("<|reserved_special_token_44|>"+ '\n' if not rounds else "") + query 34 | }) 35 | 36 | input_ids = tokenizer.apply_chat_template( 37 | g_input_msg, 38 | add_generation_prompt=True, 39 | padding="longest", 40 | return_tensors="pt", 41 | ) 42 | input_id_list = input_ids[0].tolist() 43 | input_id_list[input_id_list.index(128049)]=-200 44 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 45 | 46 | image = Image.open(args.image_path).convert('RGB') 47 | if args.slide_window: 48 | image_tensor = process_images_slid_window(image, image_processor, model_config, None, None, 336) 49 | else: 50 | image_tensor = process_images([image], image_processor, model_config)[0] 51 | 52 | return input_ids.unsqueeze(0), image_tensor.unsqueeze(0) 53 | 54 | 55 | def infer_model(args): 56 | # Model 57 | disable_torch_init() 58 | model_path = os.path.expanduser(args.model_path) 59 | model_name = get_model_name_from_path(model_path) 60 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 61 | tokenizer.pad_token = tokenizer.eos_token 62 | 63 | rounds = 0 64 | while 1: 65 | try: 66 | query = input("user: ") 67 | if query == "exit": 68 | break 69 | except: 70 | continue 71 | 72 | input_ids, image_tensor = get_input(tokenizer, image_processor, model.config, rounds, query, args) 73 | input_ids = input_ids.to(device='cuda', non_blocking=True) 74 | 75 | with torch.inference_mode(): 76 | output_ids = model.generate( 77 | input_ids, 78 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 79 | do_sample=True if args.temperature > 0 else False, 80 | temperature=args.temperature, 81 | eos_token_id=[tokenizer.convert_tokens_to_ids("<|eot_id|>",)], 82 | top_p=args.top_p, 83 | num_beams=args.num_beams, 84 | max_new_tokens=128, 85 | use_cache=True) 86 | 87 | input_token_len = input_ids.shape[1] 88 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 89 | if n_diff_input_output > 0: 90 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 91 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 92 | outputs = outputs.strip() 93 | print("qh360_vl:", outputs) 94 | 95 | g_input_msg.append({ 96 | "role": "assistant", 97 | "content": outputs 98 | }) 99 | rounds += 1 100 | 101 | 102 | 103 | 104 | if __name__ == "__main__": 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument("--model-path", type=str, default=None) 107 | parser.add_argument("--image-path", type=str, default=None) 108 | parser.add_argument("--model-base", type=str, default=None) 109 | parser.add_argument("--temperature", type=float, default=0) 110 | parser.add_argument("--top_p", type=float, default=None) 111 | parser.add_argument("--num_beams", type=int, default=1) 112 | parser.add_argument("--slide_window", action="store_true") 113 | args = parser.parse_args() 114 | 115 | infer_model(args) -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, 
DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from qh360_vl.conversation import conv_templates, SeparatorStyle 10 | from qh360_vl.model.builder import load_pretrained_model 11 | from qh360_vl.utils import disable_torch_init 12 | from qh360_vl.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria,process_images_slid_window 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 35 | tokenizer.pad_token = tokenizer.eos_token 36 | 37 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 38 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 39 | answers_file = os.path.expanduser(args.answers_file) 40 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 41 | ans_file = open(answers_file, "w") 42 | for line in tqdm(questions): 43 | idx = line["question_id"] 44 | image_file = line["image"] 45 | qs = line["text"] 46 | cur_prompt = qs 47 | if model.config.mm_use_im_start_end: 48 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 49 | else: 50 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 51 | 52 | input_msg = [ 53 | { 54 | "role": "system", 55 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
56 | }, 57 | { 58 | "role": "user", 59 | "content": qs 60 | } 61 | ] 62 | input_ids = tokenizer.apply_chat_template( 63 | input_msg, 64 | add_generation_prompt=True, 65 | padding="longest", 66 | return_tensors="pt", 67 | ) 68 | input_id_list = input_ids[0].tolist() 69 | input_id_list[input_id_list.index(128049)]=-200 70 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device).unsqueeze(0).cuda() 71 | 72 | image = Image.open(os.path.join(args.image_folder, image_file)) 73 | 74 | if args.slide_window: 75 | image_tensor = process_images_slid_window(image, image_processor, model.config, None, None, 336) 76 | else: 77 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 78 | 79 | terminators = [ 80 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 81 | ] 82 | 83 | with torch.inference_mode(): 84 | output_ids = model.generate( 85 | input_ids, 86 | images=image_tensor.unsqueeze(0).half().cuda(), 87 | do_sample=True if args.temperature > 0 else False, 88 | temperature=args.temperature, 89 | eos_token_id=terminators, 90 | top_p=args.top_p, 91 | num_beams=args.num_beams, 92 | # no_repeat_ngram_size=3, 93 | max_new_tokens=1024, 94 | use_cache=True) 95 | 96 | input_token_len = input_ids.shape[1] 97 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 98 | if n_diff_input_output > 0: 99 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 100 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 101 | 102 | outputs = outputs.strip() 103 | 104 | ans_id = shortuuid.uuid() 105 | ans_file.write(json.dumps({"question_id": idx, 106 | "prompt": cur_prompt, 107 | "text": outputs, 108 | "answer_id": ans_id, 109 | "model_id": model_name, 110 | "metadata": {}}) + "\n") 111 | ans_file.flush() 112 | ans_file.close() 113 | 114 | if __name__ == "__main__": 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 117 | parser.add_argument("--model-base", type=str, default=None) 118 | parser.add_argument("--image-folder", type=str, default="") 119 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 120 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 121 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 122 | parser.add_argument("--num-chunks", type=int, default=1) 123 | parser.add_argument("--chunk-idx", type=int, default=0) 124 | parser.add_argument("--temperature", type=float, default=0.2) 125 | parser.add_argument("--top_p", type=float, default=None) 126 | parser.add_argument("--num_beams", type=int, default=1) 127 | parser.add_argument("--slide_window", action="store_true") 128 | args = parser.parse_args() 129 | 130 | eval_model(args) 131 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_loader_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils 
import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | 46 | input_msg = [ 47 | { 48 | "role": "system", 49 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 50 | }, 51 | { 52 | "role": "user", 53 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 54 | } 55 | ] 56 | input_ids = self.tokenizer.apply_chat_template( 57 | input_msg, 58 | add_generation_prompt=True, 59 | padding="longest", 60 | return_tensors="pt", 61 | ) 62 | 63 | input_id_list = input_ids[0].tolist() 64 | input_id_list[input_id_list.index(128049)]=-200 65 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 66 | 67 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 68 | 69 | if self.args.slide_window: 70 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 71 | else: 72 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 73 | 74 | return input_ids, image_tensor 75 | 76 | def __len__(self): 77 | return len(self.questions) 78 | 79 | 80 | # DataLoader 81 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 82 | assert batch_size == 1, "batch_size must be 1" 83 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 84 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 85 | return data_loader 86 | 87 | 88 | def eval_model(args): 89 | # Model 90 | torch.distributed.init_process_group( 91 | backend='nccl', 92 | world_size=int(os.getenv('WORLD_SIZE', '1')), 93 | rank=int(os.getenv('RANK', '0')), 94 | ) 95 | 96 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 97 | 98 | disable_torch_init() 99 | model_path = os.path.expanduser(args.model_path) 100 | model_name = get_model_name_from_path(model_path) 101 | tokenizer, model, 
image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 102 | tokenizer.pad_token = tokenizer.eos_token 103 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 104 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 105 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 106 | 107 | answers_file = os.path.expanduser(args.answers_file) 108 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 109 | ans_file = open(answers_file, "w") 110 | 111 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 112 | args.conv_mode = args.conv_mode + '_mmtag' 113 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 114 | 115 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 116 | 117 | all_outputs = [] 118 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 119 | idx = line["question_id"] 120 | cur_prompt = line["text"] 121 | 122 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 123 | input_ids = input_ids.to(device='cuda', non_blocking=True) 124 | terminators = [ 125 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 126 | ] 127 | 128 | with torch.inference_mode(): 129 | output_ids = model.generate( 130 | input_ids, 131 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 132 | do_sample=True if args.temperature > 0 else False, 133 | temperature=args.temperature, 134 | eos_token_id=terminators, 135 | top_p=args.top_p, 136 | num_beams=args.num_beams, 137 | max_new_tokens=1280, 138 | use_cache=True) 139 | 140 | input_token_len = input_ids.shape[1] 141 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 142 | if n_diff_input_output > 0: 143 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 144 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 145 | outputs = outputs.strip() 146 | print(outputs) 147 | 148 | ans_id = shortuuid.uuid() 149 | 150 | all_outputs.append({"question_id": idx, 151 | "prompt": cur_prompt, 152 | "text": outputs, 153 | "answer_id": ans_id, 154 | "model_id": model_name, 155 | "metadata": {}}) 156 | 157 | torch.distributed.barrier() 158 | 159 | world_size = torch.distributed.get_world_size() 160 | merged_outputs = [None for _ in range(world_size)] 161 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 162 | 163 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 164 | 165 | if torch.distributed.get_rank() == 0: 166 | for item in merged_outputs: 167 | ans_file.write(json.dumps(item) + "\n") 168 | ans_file.close() 169 | torch.distributed.barrier() 170 | 171 | if __name__ == "__main__": 172 | parser = argparse.ArgumentParser() 173 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 174 | parser.add_argument("--model-base", type=str, default=None) 175 | parser.add_argument("--image-folder", type=str, default="") 176 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 177 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 178 | parser.add_argument("--conv-mode", 
type=str, default="llava_v1") 179 | parser.add_argument("--num-chunks", type=int, default=1) 180 | parser.add_argument("--chunk-idx", type=int, default=0) 181 | parser.add_argument("--temperature", type=float, default=0.2) 182 | parser.add_argument("--top_p", type=float, default=None) 183 | parser.add_argument("--num_beams", type=int, default=1) 184 | parser.add_argument("--slide_window", action="store_true") 185 | args = parser.parse_args() 186 | 187 | eval_model(args) 188 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_loader_llama3_nodist.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | input_msg = [ 46 | { 47 | "role": "system", 48 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
49 | }, 50 | { 51 | "role": "user", 52 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 53 | } 54 | ] 55 | input_ids = self.tokenizer.apply_chat_template( 56 | input_msg, 57 | add_generation_prompt=True, 58 | padding="longest", 59 | return_tensors="pt", 60 | ) 61 | 62 | input_id_list = input_ids[0].tolist() 63 | input_id_list[input_id_list.index(128049)]=-200 64 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 65 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 66 | if self.args.slide_window: 67 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 68 | else: 69 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 70 | 71 | return input_ids, image_tensor 72 | 73 | 74 | def __len__(self): 75 | return len(self.questions) 76 | 77 | 78 | # DataLoader 79 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 80 | assert batch_size == 1, "batch_size must be 1" 81 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 82 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 83 | return data_loader 84 | 85 | 86 | def eval_model(args): 87 | # Model 88 | disable_torch_init() 89 | model_path = os.path.expanduser(args.model_path) 90 | model_name = get_model_name_from_path(model_path) 91 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 92 | tokenizer.pad_token = tokenizer.eos_token 93 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 94 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 95 | 96 | answers_file = os.path.expanduser(args.answers_file) 97 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 98 | ans_file = open(answers_file, "w") 99 | 100 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 101 | args.conv_mode = args.conv_mode + '_mmtag' 102 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 103 | 104 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 105 | 106 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 107 | idx = line["question_id"] 108 | cur_prompt = line["text"] 109 | 110 | input_ids = input_ids.to(device='cuda', non_blocking=True) 111 | terminators = [ 112 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 113 | ] 114 | 115 | with torch.inference_mode(): 116 | output_ids = model.generate( 117 | input_ids, 118 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 119 | do_sample=True if args.temperature > 0 else False, 120 | temperature=args.temperature, 121 | eos_token_id=terminators, 122 | top_p=args.top_p, 123 | num_beams=args.num_beams, 124 | max_new_tokens=1280, 125 | use_cache=True) 126 | 127 | input_token_len = input_ids.shape[1] 128 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 129 | if n_diff_input_output > 0: 130 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 131 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 132 | outputs = outputs.strip() 
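        # Descriptive comment (added for clarity): `outputs` now holds only the newly generated tokens; the first input_token_len prompt tokens were sliced off above before decoding.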
133 | print(outputs) 134 | 135 | ans_id = shortuuid.uuid() 136 | ans_file.write(json.dumps({"question_id": idx, 137 | "prompt": cur_prompt, 138 | "text": outputs, 139 | "answer_id": ans_id, 140 | "model_id": model_name, 141 | "metadata": {}}) + "\n") 142 | 143 | ans_file.close() 144 | 145 | 146 | if __name__ == "__main__": 147 | parser = argparse.ArgumentParser() 148 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 149 | parser.add_argument("--model-base", type=str, default=None) 150 | parser.add_argument("--image-folder", type=str, default="") 151 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 152 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 153 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 154 | parser.add_argument("--num-chunks", type=int, default=1) 155 | parser.add_argument("--chunk-idx", type=int, default=0) 156 | parser.add_argument("--temperature", type=float, default=0.2) 157 | parser.add_argument("--top_p", type=float, default=None) 158 | parser.add_argument("--num_beams", type=int, default=1) 159 | parser.add_argument("--slide_window", action="store_true") 160 | args = parser.parse_args() 161 | 162 | eval_model(args) 163 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_loader_raw.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | 46 | input_msg = [ 47 | { 48 | "role": "system", 49 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. 
If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 50 | }, 51 | { 52 | "role": "user", 53 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 54 | } 55 | ] 56 | input_ids = self.tokenizer.apply_chat_template( 57 | input_msg, 58 | add_generation_prompt=True, 59 | padding="longest", 60 | return_tensors="pt", 61 | ) 62 | 63 | input_id_list = input_ids[0].tolist() 64 | input_id_list[input_id_list.index(128049)]=-200 65 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 66 | 67 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 68 | 69 | if self.args.slide_window: 70 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 71 | else: 72 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 73 | 74 | return input_ids, image_tensor 75 | 76 | def __len__(self): 77 | return len(self.questions) 78 | 79 | 80 | # DataLoader 81 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 82 | assert batch_size == 1, "batch_size must be 1" 83 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 84 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 85 | return data_loader 86 | 87 | 88 | def eval_model(args): 89 | # Model 90 | disable_torch_init() 91 | model_path = os.path.expanduser(args.model_path) 92 | model_name = get_model_name_from_path(model_path) 93 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 94 | tokenizer.pad_token = tokenizer.eos_token 95 | 96 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 97 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 98 | answers_file = os.path.expanduser(args.answers_file) 99 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 100 | ans_file = open(answers_file, "w") 101 | 102 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: # False 103 | args.conv_mode = args.conv_mode + '_mmtag' 104 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 105 | 106 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 107 | 108 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 109 | idx = line["question_id"] 110 | cur_prompt = line["text"] 111 | 112 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 # '' 113 | input_ids = input_ids.to(device='cuda', non_blocking=True) 114 | terminators = [ 115 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 116 | ] 117 | 118 | with torch.inference_mode(): 119 | output_ids = model.generate( 120 | input_ids, 121 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 122 | do_sample=True if args.temperature > 0 else False, 123 | temperature=args.temperature, 124 | eos_token_id=terminators, 125 | top_p=args.top_p, 126 | num_beams=args.num_beams, 127 | max_new_tokens=128, 128 | use_cache=True) 129 | 130 | 
input_token_len = input_ids.shape[1] 131 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 132 | if n_diff_input_output > 0: 133 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 134 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 135 | outputs = outputs.strip() 136 | 137 | ans_id = shortuuid.uuid() 138 | ans_file.write(json.dumps({"question_id": idx, 139 | "prompt": cur_prompt, 140 | "text": outputs, 141 | "answer_id": ans_id, 142 | "model_id": model_name, 143 | "metadata": {}}, 144 | ensure_ascii=False) + "\n") 145 | ans_file.flush() 146 | ans_file.close() 147 | 148 | if __name__ == "__main__": 149 | parser = argparse.ArgumentParser() 150 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 151 | parser.add_argument("--model-base", type=str, default=None) 152 | parser.add_argument("--image-folder", type=str, default="") 153 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 154 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 155 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 156 | parser.add_argument("--num-chunks", type=int, default=1) 157 | parser.add_argument("--chunk-idx", type=int, default=0) 158 | parser.add_argument("--temperature", type=float, default=0.2) 159 | parser.add_argument("--top_p", type=float, default=None) 160 | parser.add_argument("--num_beams", type=int, default=1) 161 | parser.add_argument("--slide_window", action="store_true") 162 | args = parser.parse_args() 163 | 164 | eval_model(args) 165 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_mmbench_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import shortuuid 8 | import itertools 9 | 10 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 11 | from qh360_vl.conversation import conv_templates, SeparatorStyle 12 | from qh360_vl.model.builder import load_pretrained_model 13 | from qh360_vl.utils import disable_torch_init 14 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path,process_images_slid_window 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | all_options = ['A', 'B', 'C', 'D'] 21 | 22 | 23 | def split_list(lst, n): 24 | """Split a list into n (roughly) equal-sized chunks""" 25 | chunk_size = math.ceil(len(lst) / n) # integer division 26 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 27 | 28 | 29 | def get_chunk(lst, n, k): 30 | chunks = split_list(lst, n) 31 | return chunks[k] 32 | 33 | 34 | def is_none(value): 35 | if value is None: 36 | return True 37 | if type(value) is float and math.isnan(value): 38 | return True 39 | if type(value) is str and value.lower() == 'nan': 40 | return True 41 | if type(value) is str and value.lower() == 'none': 42 | return True 43 | return False 44 | 45 | def get_options(row, options): 46 | parsed_options = [] 47 | for option in options: 48 | option_value = row[option] 49 | if is_none(option_value): 50 | break 51 | parsed_options.append(option_value) 52 | return parsed_options 53 | 54 | 55 | def eval_model(args): 56 | # Model 57 | 58 | 
torch.distributed.init_process_group( 59 | backend='nccl', 60 | world_size=int(os.getenv('WORLD_SIZE', '1')), 61 | rank=int(os.getenv('RANK', '0')), 62 | ) 63 | 64 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 65 | 66 | disable_torch_init() 67 | model_path = os.path.expanduser(args.model_path) 68 | model_name = get_model_name_from_path(model_path) 69 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 70 | tokenizer.pad_token = tokenizer.eos_token 71 | 72 | questions = pd.read_table(os.path.expanduser(args.question_file)) 73 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 74 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 75 | 76 | answers_file = os.path.expanduser(args.answers_file) 77 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 78 | ans_file = open(answers_file, "w") 79 | 80 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 81 | args.conv_mode = args.conv_mode + '_mmtag' 82 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 83 | 84 | all_outputs = [] 85 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 86 | options = get_options(row, all_options) 87 | cur_option_char = all_options[:len(options)] 88 | 89 | if args.all_rounds: 90 | num_rounds = len(options) 91 | else: 92 | num_rounds = 1 93 | 94 | for round_idx in range(num_rounds): 95 | idx = row['index'] 96 | question = row['question'] 97 | hint = row['hint'] 98 | image = load_image_from_base64(row['image']) 99 | if not is_none(hint): 100 | question = hint + '\n' + question 101 | for option_char, option in zip(all_options[:len(options)], options): 102 | question = question + '\n' + option_char + '. ' + option 103 | qs = cur_prompt = question 104 | 105 | qs = "<|reserved_special_token_44|>" + '\n' + qs 106 | 107 | if args.single_pred_prompt: 108 | if args.lang == 'cn': 109 | qs = qs + '\n' + "请直接回答选项字母。" 110 | else: 111 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 112 | input_msg = [ 113 | { 114 | "role": "system", 115 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
116 | }, 117 | { 118 | "role": "user", 119 | "content": qs 120 | } 121 | ] 122 | input_ids = tokenizer.apply_chat_template( 123 | input_msg, 124 | add_generation_prompt=True, 125 | padding="longest", 126 | return_tensors="pt", 127 | ) 128 | input_id_list = input_ids[0].tolist() 129 | input_id_list[input_id_list.index(128049)]=-200 130 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device).unsqueeze(0).cuda() 131 | if args.slide_window: 132 | image_tensor = process_images_slid_window(image, image_processor, model.config, None, None, 336) 133 | else: 134 | image_tensor = process_images([image], image_processor, model.config)[0] 135 | 136 | terminators = [ 137 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 138 | ] 139 | 140 | with torch.inference_mode(): 141 | output_ids = model.generate( 142 | input_ids, 143 | images=image_tensor.unsqueeze(0).half().cuda(), 144 | do_sample=True if args.temperature > 0 else False, 145 | temperature=args.temperature, 146 | eos_token_id=terminators, 147 | top_p=args.top_p, 148 | num_beams=args.num_beams, 149 | # no_repeat_ngram_size=3, 150 | max_new_tokens=1024, 151 | use_cache=True) 152 | 153 | input_token_len = input_ids.shape[1] 154 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 155 | if n_diff_input_output > 0: 156 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 157 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 158 | outputs = outputs.strip() 159 | print(outputs) 160 | 161 | ans_id = shortuuid.uuid() 162 | all_outputs.append({"question_id": idx, 163 | "round_id": round_idx, 164 | "prompt": cur_prompt, 165 | "text": outputs, 166 | "options": options, 167 | "option_char": cur_option_char, 168 | "answer_id": ans_id, 169 | "model_id": model_name, 170 | "metadata": {}}) 171 | # rotate options 172 | options = options[1:] + options[:1] 173 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 174 | 175 | torch.distributed.barrier() 176 | 177 | world_size = torch.distributed.get_world_size() 178 | merged_outputs = [None for _ in range(world_size)] 179 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 180 | 181 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 182 | 183 | if torch.distributed.get_rank() == 0: 184 | for item in merged_outputs: 185 | ans_file.write(json.dumps(item) + "\n") 186 | ans_file.close() 187 | torch.distributed.barrier() 188 | 189 | if __name__ == "__main__": 190 | parser = argparse.ArgumentParser() 191 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 192 | parser.add_argument("--model-base", type=str, default=None) 193 | parser.add_argument("--image-folder", type=str, default="") 194 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 195 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 196 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 197 | parser.add_argument("--num-chunks", type=int, default=1) 198 | parser.add_argument("--chunk-idx", type=int, default=0) 199 | parser.add_argument("--temperature", type=float, default=0.2) 200 | parser.add_argument("--top_p", type=float, default=None) 201 | parser.add_argument("--num_beams", type=int, default=1) 202 | parser.add_argument("--all-rounds", action="store_true") 203 | parser.add_argument("--single-pred-prompt", action="store_true") 204 | parser.add_argument("--lang", type=str, 
default="en") 205 | parser.add_argument("--slide_window", action="store_true") 206 | args = parser.parse_args() 207 | 208 | eval_model(args) 209 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_mme_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | qs = qs.replace("\nAnswer the question using a single word or phrase.", " Please answer yes or no.") # open compass 46 | 47 | input_msg = [ 48 | { 49 | "role": "system", 50 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
51 | }, 52 | { 53 | "role": "user", 54 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 55 | } 56 | ] 57 | input_ids = self.tokenizer.apply_chat_template( 58 | input_msg, 59 | add_generation_prompt=True, 60 | padding="longest", 61 | return_tensors="pt", 62 | ) 63 | 64 | input_id_list = input_ids[0].tolist() 65 | input_id_list[input_id_list.index(128049)]=-200 66 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 67 | 68 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 69 | 70 | if self.args.slide_window: 71 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 72 | else: 73 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 74 | 75 | return input_ids, image_tensor 76 | 77 | def __len__(self): 78 | return len(self.questions) 79 | 80 | 81 | # DataLoader 82 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 83 | assert batch_size == 1, "batch_size must be 1" 84 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 85 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 86 | return data_loader 87 | 88 | 89 | def eval_model(args): 90 | # Model 91 | torch.distributed.init_process_group( 92 | backend='nccl', 93 | world_size=int(os.getenv('WORLD_SIZE', '1')), 94 | rank=int(os.getenv('RANK', '0')), 95 | ) 96 | 97 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 98 | 99 | disable_torch_init() 100 | model_path = os.path.expanduser(args.model_path) 101 | model_name = get_model_name_from_path(model_path) 102 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 103 | tokenizer.pad_token = tokenizer.eos_token 104 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 105 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 106 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 107 | 108 | answers_file = os.path.expanduser(args.answers_file) 109 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 110 | ans_file = open(answers_file, "w") 111 | 112 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 113 | args.conv_mode = args.conv_mode + '_mmtag' 114 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 115 | 116 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 117 | 118 | all_outputs = [] 119 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 120 | idx = line["question_id"] 121 | cur_prompt = line["text"] 122 | 123 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 124 | input_ids = input_ids.to(device='cuda', non_blocking=True) 125 | terminators = [ 126 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 127 | ] 128 | 129 | with torch.inference_mode(): 130 | output_ids = model.generate( 131 | input_ids, 132 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 133 | do_sample=True if args.temperature > 0 else False, 134 | temperature=args.temperature, 
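                    # Descriptive comment (added for clarity): `terminators` holds the id of "<|eot_id|>", so generation stops at the end of the assistant turn.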
135 | eos_token_id=terminators, 136 | top_p=args.top_p, 137 | num_beams=args.num_beams, 138 | max_new_tokens=1280, 139 | use_cache=True) 140 | 141 | input_token_len = input_ids.shape[1] 142 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 143 | if n_diff_input_output > 0: 144 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 145 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 146 | outputs = outputs.strip() 147 | print(outputs) 148 | 149 | ans_id = shortuuid.uuid() 150 | 151 | all_outputs.append({"question_id": idx, 152 | "prompt": cur_prompt, 153 | "text": outputs, 154 | "answer_id": ans_id, 155 | "model_id": model_name, 156 | "metadata": {}}) 157 | 158 | torch.distributed.barrier() 159 | 160 | world_size = torch.distributed.get_world_size() 161 | merged_outputs = [None for _ in range(world_size)] 162 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 163 | 164 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 165 | 166 | if torch.distributed.get_rank() == 0: 167 | for item in merged_outputs: 168 | ans_file.write(json.dumps(item) + "\n") 169 | ans_file.close() 170 | torch.distributed.barrier() 171 | 172 | if __name__ == "__main__": 173 | parser = argparse.ArgumentParser() 174 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 175 | parser.add_argument("--model-base", type=str, default=None) 176 | parser.add_argument("--image-folder", type=str, default="") 177 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 178 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 179 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 180 | parser.add_argument("--num-chunks", type=int, default=1) 181 | parser.add_argument("--chunk-idx", type=int, default=0) 182 | parser.add_argument("--temperature", type=float, default=0.2) 183 | parser.add_argument("--top_p", type=float, default=None) 184 | parser.add_argument("--num_beams", type=int, default=1) 185 | parser.add_argument("--slide_window", action="store_true") 186 | args = parser.parse_args() 187 | 188 | eval_model(args) 189 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_pope_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def 
__init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | qs = qs.replace("\nAnswer the question using a single word or phrase.", " Please answer yes or no.") # open compass 46 | 47 | input_msg = [ 48 | { 49 | "role": "system", 50 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 51 | }, 52 | { 53 | "role": "user", 54 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 55 | } 56 | ] 57 | input_ids = self.tokenizer.apply_chat_template( 58 | input_msg, 59 | add_generation_prompt=True, 60 | padding="longest", 61 | return_tensors="pt", 62 | ) 63 | 64 | input_id_list = input_ids[0].tolist() 65 | input_id_list[input_id_list.index(128049)]=-200 66 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 67 | 68 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 69 | 70 | if self.args.slide_window: 71 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 72 | else: 73 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 74 | 75 | return input_ids, image_tensor 76 | 77 | def __len__(self): 78 | return len(self.questions) 79 | 80 | 81 | # DataLoader 82 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 83 | assert batch_size == 1, "batch_size must be 1" 84 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 85 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 86 | return data_loader 87 | 88 | 89 | def eval_model(args): 90 | # Model 91 | torch.distributed.init_process_group( 92 | backend='nccl', 93 | world_size=int(os.getenv('WORLD_SIZE', '1')), 94 | rank=int(os.getenv('RANK', '0')), 95 | ) 96 | 97 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 98 | 99 | disable_torch_init() 100 | model_path = os.path.expanduser(args.model_path) 101 | model_name = get_model_name_from_path(model_path) 102 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 103 | tokenizer.pad_token = tokenizer.eos_token 104 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 105 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 106 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 107 | 108 | answers_file = os.path.expanduser(args.answers_file) 109 | 
os.makedirs(os.path.dirname(answers_file), exist_ok=True) 110 | ans_file = open(answers_file, "w") 111 | 112 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 113 | args.conv_mode = args.conv_mode + '_mmtag' 114 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 115 | 116 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 117 | 118 | all_outputs = [] 119 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 120 | idx = line["question_id"] 121 | cur_prompt = line["text"] 122 | 123 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 124 | input_ids = input_ids.to(device='cuda', non_blocking=True) 125 | terminators = [ 126 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 127 | ] 128 | 129 | with torch.inference_mode(): 130 | output_ids = model.generate( 131 | input_ids, 132 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 133 | do_sample=True if args.temperature > 0 else False, 134 | temperature=args.temperature, 135 | eos_token_id=terminators, 136 | top_p=args.top_p, 137 | num_beams=args.num_beams, 138 | max_new_tokens=1280, 139 | use_cache=True) 140 | 141 | input_token_len = input_ids.shape[1] 142 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 143 | if n_diff_input_output > 0: 144 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 145 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 146 | outputs = outputs.strip() 147 | print(outputs) 148 | 149 | ans_id = shortuuid.uuid() 150 | 151 | all_outputs.append({"question_id": idx, 152 | "prompt": cur_prompt, 153 | "text": outputs, 154 | "answer_id": ans_id, 155 | "model_id": model_name, 156 | "metadata": {}}) 157 | 158 | torch.distributed.barrier() 159 | 160 | world_size = torch.distributed.get_world_size() 161 | merged_outputs = [None for _ in range(world_size)] 162 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 163 | 164 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 165 | 166 | if torch.distributed.get_rank() == 0: 167 | for item in merged_outputs: 168 | ans_file.write(json.dumps(item) + "\n") 169 | ans_file.close() 170 | torch.distributed.barrier() 171 | 172 | if __name__ == "__main__": 173 | parser = argparse.ArgumentParser() 174 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 175 | parser.add_argument("--model-base", type=str, default=None) 176 | parser.add_argument("--image-folder", type=str, default="") 177 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 178 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 179 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 180 | parser.add_argument("--num-chunks", type=int, default=1) 181 | parser.add_argument("--chunk-idx", type=int, default=0) 182 | parser.add_argument("--temperature", type=float, default=0.2) 183 | parser.add_argument("--top_p", type=float, default=None) 184 | parser.add_argument("--num_beams", type=int, default=1) 185 | parser.add_argument("--slide_window", action="store_true") 186 | args = parser.parse_args() 187 | 188 | eval_model(args) 189 | 
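The loaders above (llama3, nodist, raw, mmbench, mme, pope) all build their prompts the same way: the Llama-3 chat template emits the reserved placeholder "<|reserved_special_token_44|>" (token id 128049), which is then swapped for the IMAGE_TOKEN_INDEX sentinel (-200) so the multimodal forward pass knows where to splice in the projected image features. The snippet below is only an illustrative sketch of that substitution step; the helper name and the shortened system prompt are assumptions, while the token ids are taken from the scripts themselves.

```python
import torch

IMAGE_PLACEHOLDER = "<|reserved_special_token_44|>"   # reserved Llama-3 token used as the image slot
IMAGE_PLACEHOLDER_ID = 128049                          # its id in the Llama-3 vocabulary
IMAGE_TOKEN_INDEX = -200                               # sentinel consumed by the multimodal forward pass

def build_multimodal_input_ids(tokenizer, question):
    # Assemble the same system/user structure used by the eval loaders (system prompt shortened here).
    msgs = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": IMAGE_PLACEHOLDER + "\n" + question},
    ]
    ids = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, return_tensors="pt")
    id_list = ids[0].tolist()
    # Replace the placeholder token with the -200 sentinel, exactly as the loaders do.
    id_list[id_list.index(IMAGE_PLACEHOLDER_ID)] = IMAGE_TOKEN_INDEX
    return torch.tensor(id_list, dtype=ids.dtype).unsqueeze(0)
```

The distributed variants then shard the question list by rank, run generation locally, and merge the per-rank answer lists with torch.distributed.all_gather_object before rank 0 writes the final JSONL file.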
-------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_refcoco_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | import random 19 | 20 | 21 | def split_list(lst, n): 22 | """Split a list into n (roughly) equal-sized chunks""" 23 | chunk_size = math.ceil(len(lst) / n) # integer division 24 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 25 | 26 | 27 | def get_chunk(lst, n, k): 28 | chunks = split_list(lst, n) 29 | return chunks[k] 30 | 31 | 32 | # Custom dataset class 33 | class CustomDataset(Dataset): 34 | def __init__(self, questions, image_folder, is_all_img_resize_672, tokenizer, image_processor, model_config, args): 35 | self.questions = questions 36 | self.image_folder = image_folder 37 | self.tokenizer = tokenizer 38 | self.image_processor = image_processor 39 | self.model_config = model_config 40 | 41 | self.is_all_img_resize_672 = is_all_img_resize_672 42 | self.args = args 43 | 44 | 45 | def __getitem__(self, index): 46 | line = self.questions[index] 47 | image_file = line["img_path"] 48 | 49 | qs = 'Please provide the bounding box coordinate of the region this sentence describes: '+line["expression"] 50 | 51 | input_msg = [ 52 | { 53 | "role": "system", 54 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
55 | }, 56 | { 57 | "role": "user", 58 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 59 | } 60 | ] 61 | input_ids = self.tokenizer.apply_chat_template( 62 | input_msg, 63 | add_generation_prompt=True, 64 | padding="longest", 65 | return_tensors="pt", 66 | ) 67 | # print(input_ids) 68 | input_id_list = input_ids[0].tolist() 69 | input_id_list[input_id_list.index(128049)]=-200 70 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 71 | 72 | 73 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 74 | 75 | if self.args.slide_window: 76 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 77 | else: 78 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 79 | 80 | return input_ids, image_tensor 81 | 82 | def __len__(self): 83 | return len(self.questions) 84 | 85 | 86 | 87 | # DataLoader 88 | def create_data_loader(questions, image_folder, is_all_img_resize_672, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=8): 89 | assert batch_size == 1, "batch_size must be 1" 90 | dataset = CustomDataset(questions, image_folder, is_all_img_resize_672, tokenizer, image_processor, model_config,args) 91 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=None,) 92 | return data_loader 93 | 94 | 95 | def eval_model(args): 96 | # Model 97 | 98 | 99 | torch.distributed.init_process_group( 100 | backend='nccl', 101 | world_size=int(os.getenv('WORLD_SIZE', '1')), 102 | rank=int(os.getenv('RANK', '0')), 103 | ) 104 | 105 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 106 | 107 | 108 | disable_torch_init() 109 | 110 | model_path = os.path.expanduser(args.model_path) 111 | model_name = get_model_name_from_path(model_path) 112 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 113 | tokenizer.pad_token = tokenizer.eos_token 114 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 115 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 116 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 117 | 118 | answers_file = os.path.expanduser(args.answers_file) 119 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 120 | ans_file = open(answers_file, "w") 121 | 122 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 123 | args.conv_mode = args.conv_mode + '_mmtag' 124 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 125 | 126 | data_loader = create_data_loader(questions, args.image_folder, args.is_all_img_resize_672, tokenizer, image_processor, model.config, args) 127 | 128 | 129 | all_outputs = [] 130 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 131 | 132 | idx = line["sent_id"] 133 | cur_prompt = line["expression"] 134 | 135 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 136 | input_ids = input_ids.to(device='cuda', non_blocking=True) 137 | terminators = [ 138 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 139 | ] 140 | 141 | with torch.inference_mode(): 142 | output_ids = model.generate( 143 | input_ids, 144 | 
images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 145 | do_sample=True if args.temperature > 0 else False, 146 | temperature=args.temperature, 147 | eos_token_id=terminators, 148 | top_p=args.top_p, 149 | num_beams=args.num_beams, 150 | max_new_tokens=128, 151 | use_cache=True) 152 | 153 | input_token_len = input_ids.shape[1] 154 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 155 | if n_diff_input_output > 0: 156 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 157 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 158 | outputs = outputs.strip() 159 | 160 | print('expression:', cur_prompt) 161 | print('output:', outputs) 162 | 163 | ans_id = shortuuid.uuid() 164 | all_outputs.append({"question_id": idx, 165 | "prompt": cur_prompt, 166 | "text": outputs, 167 | "answer_id": ans_id, 168 | "model_id": model_name, 169 | "metadata": {}}) 170 | 171 | torch.distributed.barrier() 172 | 173 | world_size = torch.distributed.get_world_size() 174 | merged_outputs = [None for _ in range(world_size)] 175 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 176 | 177 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 178 | 179 | if torch.distributed.get_rank() == 0: 180 | for item in merged_outputs: 181 | ans_file.write(json.dumps(item, ensure_ascii=False) + "\n") 182 | ans_file.close() 183 | torch.distributed.barrier() 184 | 185 | if __name__ == "__main__": 186 | parser = argparse.ArgumentParser() 187 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 188 | parser.add_argument("--model-base", type=str, default=None) 189 | parser.add_argument("--image-folder", type=str, default="") 190 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 191 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 192 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 193 | parser.add_argument("--num-chunks", type=int, default=1) 194 | parser.add_argument("--chunk-idx", type=int, default=0) 195 | parser.add_argument("--temperature", type=float, default=0.2) 196 | parser.add_argument("--patch_img_size", type=int, default=224) 197 | parser.add_argument("--top_p", type=float, default=None) 198 | parser.add_argument("--num_beams", type=int, default=1) 199 | parser.add_argument("--is_all_img_resize_672", type=bool, default=False) 200 | parser.add_argument("--slide_window", action="store_true") 201 | args = parser.parse_args() 202 | 203 | eval_model(args) 204 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_textvqa_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def 
split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | 46 | input_msg = [ 47 | { 48 | "role": "system", 49 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 50 | }, 51 | { 52 | "role": "user", 53 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 54 | } 55 | ] 56 | input_ids = self.tokenizer.apply_chat_template( 57 | input_msg, 58 | add_generation_prompt=True, 59 | padding="longest", 60 | return_tensors="pt", 61 | ) 62 | 63 | input_id_list = input_ids[0].tolist() 64 | input_id_list[input_id_list.index(128049)]=-200 65 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 66 | 67 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 68 | 69 | if self.args.slide_window: 70 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 71 | else: 72 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 73 | 74 | return input_ids, image_tensor 75 | 76 | def __len__(self): 77 | return len(self.questions) 78 | 79 | 80 | # DataLoader 81 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 82 | assert batch_size == 1, "batch_size must be 1" 83 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 84 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 85 | return data_loader 86 | 87 | 88 | def eval_model(args): 89 | # Model 90 | torch.distributed.init_process_group( 91 | backend='nccl', 92 | world_size=int(os.getenv('WORLD_SIZE', '1')), 93 | rank=int(os.getenv('RANK', '0')), 94 | ) 95 | 96 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 97 | 98 | disable_torch_init() 99 | model_path = os.path.expanduser(args.model_path) 100 | model_name = get_model_name_from_path(model_path) 101 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 102 | tokenizer.pad_token = tokenizer.eos_token 103 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), 
"r")] 104 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 105 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 106 | 107 | answers_file = os.path.expanduser(args.answers_file) 108 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 109 | ans_file = open(answers_file, "w") 110 | 111 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 112 | args.conv_mode = args.conv_mode + '_mmtag' 113 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 114 | 115 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 116 | 117 | all_outputs = [] 118 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 119 | idx = line["question_id"] 120 | cur_prompt = line["text"] 121 | 122 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 123 | input_ids = input_ids.to(device='cuda', non_blocking=True) 124 | terminators = [ 125 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 126 | ] 127 | 128 | with torch.inference_mode(): 129 | output_ids = model.generate( 130 | input_ids, 131 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 132 | do_sample=True if args.temperature > 0 else False, 133 | temperature=args.temperature, 134 | eos_token_id=terminators, 135 | top_p=args.top_p, 136 | num_beams=args.num_beams, 137 | max_new_tokens=1280, 138 | use_cache=True) 139 | 140 | input_token_len = input_ids.shape[1] 141 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 142 | if n_diff_input_output > 0: 143 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 144 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 145 | outputs = outputs.strip() 146 | print(outputs) 147 | 148 | ans_id = shortuuid.uuid() 149 | 150 | all_outputs.append({"question_id": idx, 151 | "prompt": cur_prompt, 152 | "text": outputs, 153 | "answer_id": ans_id, 154 | "model_id": model_name, 155 | "metadata": {}}) 156 | 157 | torch.distributed.barrier() 158 | 159 | world_size = torch.distributed.get_world_size() 160 | merged_outputs = [None for _ in range(world_size)] 161 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 162 | 163 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 164 | 165 | if torch.distributed.get_rank() == 0: 166 | for item in merged_outputs: 167 | ans_file.write(json.dumps(item) + "\n") 168 | ans_file.close() 169 | torch.distributed.barrier() 170 | 171 | if __name__ == "__main__": 172 | parser = argparse.ArgumentParser() 173 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 174 | parser.add_argument("--model-base", type=str, default=None) 175 | parser.add_argument("--image-folder", type=str, default="") 176 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 177 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 178 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 179 | parser.add_argument("--num-chunks", type=int, default=1) 180 | parser.add_argument("--chunk-idx", type=int, default=0) 181 | parser.add_argument("--temperature", type=float, default=0.2) 182 | 
parser.add_argument("--top_p", type=float, default=None) 183 | parser.add_argument("--num_beams", type=int, default=1) 184 | parser.add_argument("--slide_window", action="store_true") 185 | args = parser.parse_args() 186 | 187 | eval_model(args) 188 | -------------------------------------------------------------------------------- /qh360_vl/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /qh360_vl/mm_utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from io import BytesIO 3 | import base64 4 | 5 | import torch 6 | from transformers import StoppingCriteria 7 | from qh360_vl.constants import IMAGE_TOKEN_INDEX 8 | 9 | 10 | def load_image_from_base64(image): 11 | return Image.open(BytesIO(base64.b64decode(image))) 12 | 13 | 14 | def expand2square(pil_img, background_color): 15 | width, height = pil_img.size 16 | if width == height: 17 | return pil_img 18 | elif width > height: 19 | result = Image.new(pil_img.mode, (width, width), background_color) 20 | result.paste(pil_img, (0, (width - height) // 2)) 21 | return result 22 | else: 23 | result = Image.new(pil_img.mode, 
(height, height), background_color) 24 | result.paste(pil_img, ((height - width) // 2, 0)) 25 | return result 26 | 27 | 28 | def process_images(images, image_processor, model_cfg): 29 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 30 | new_images = [] 31 | if image_aspect_ratio == 'pad': 32 | for image in images: 33 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 34 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 35 | new_images.append(image) 36 | else: 37 | return image_processor(images, return_tensors='pt')['pixel_values'] 38 | if all(x.shape == new_images[0].shape for x in new_images): 39 | new_images = torch.stack(new_images, dim=0) 40 | return new_images 41 | 42 | def get_proper_imgsize(pil_img, vit_is): 43 | max_w_h = vit_is * 2 44 | new_pil_img = pil_img.resize((max_w_h, max_w_h)) 45 | return new_pil_img 46 | 47 | def tensor_crop(tensor_array, left, upper, right, lower): 48 | # tensor_array: C * H * W 49 | return tensor_array[:, upper:lower, left:right] 50 | 51 | def image_slid_window(image, num_slid_window): 52 | # image: tensor, 3 * 336 * 336 or 3 * 672 * 672 53 | # image: tensor, 3 * 224 * 224 or 3 * 448 * 448 54 | if num_slid_window == 5: 55 | image_x2, image_x1 = image[0], image[1] 56 | vit_is = image_x1.shape[1] 57 | h, w = image_x2.shape[1],image_x2.shape[2] 58 | image0 = tensor_crop(image_x2, 0, 0, vit_is, vit_is) 59 | image1 = tensor_crop(image_x2, w-vit_is, 0, w, vit_is) 60 | image2 = tensor_crop(image_x2, 0, h-vit_is, vit_is, h) 61 | image3 = tensor_crop(image_x2, w-vit_is, h-vit_is, w, h) 62 | return torch.stack([image0, image1, image2, image3, image_x1]) 63 | else: 64 | return image 65 | 66 | def process_images_slid_window(image, image_processor, model_cfg, is_maintain_orig_img_token, is_all_img_resize_672, vit_is): 67 | vit_is = vit_is # vit_input_size, for simplicity 68 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 69 | 70 | num_slid_window = 5 71 | 72 | if image_aspect_ratio == 'pad': 73 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 74 | image = get_proper_imgsize(image, vit_is) 75 | image_x2 = image_processor.preprocess(image, return_tensors='pt', do_resize=False, do_center_crop=False)['pixel_values'][0] 76 | image_x1 = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 77 | image = [image_x2, image_x1] 78 | else: 79 | image = get_proper_imgsize(image, vit_is) 80 | image_x2 = image_processor.preprocess(image, return_tensors='pt', do_resize=False, do_center_crop=False)['pixel_values'][0] 81 | image_x1 = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 82 | image = [image_x2, image_x1] 83 | 84 | image = image_slid_window(image, num_slid_window) 85 | 86 | return image 87 | 88 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 89 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')] 90 | 91 | def insert_separator(X, sep): 92 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 93 | 94 | input_ids = [] 95 | offset = 0 96 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 97 | offset = 1 98 | input_ids.append(prompt_chunks[0][0]) 99 | 100 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 101 | input_ids.extend(x[offset:]) 102 | 103 | if return_tensors is not None: 104 | if
return_tensors == 'pt': 105 | return torch.tensor(input_ids, dtype=torch.long) 106 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 107 | return input_ids 108 | 109 | # def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 110 | # # prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 111 | # prompt_chunks = [] # compatible with transformers>=4.32.0 112 | # for chunk in prompt.split(''): 113 | # if len(chunk) > 0: 114 | # prompt_chunks.append(tokenizer(chunk).input_ids) 115 | # else: 116 | # prompt_chunks.append([tokenizer.bos_token_id]) 117 | 118 | # def insert_separator(X, sep): 119 | # return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 120 | 121 | # input_ids = [] 122 | # offset = 0 123 | # if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 124 | # offset = 1 125 | # input_ids.append(prompt_chunks[0][0]) 126 | 127 | # for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 128 | # input_ids.extend(x[offset:]) 129 | 130 | # if return_tensors is not None: 131 | # if return_tensors == 'pt': 132 | # return torch.tensor(input_ids, dtype=torch.long) 133 | # raise ValueError(f'Unsupported tensor type: {return_tensors}') 134 | # return input_ids 135 | 136 | def get_model_name_from_path(model_path): 137 | model_path = model_path.strip("/") 138 | model_paths = model_path.split("/") 139 | if model_paths[-1].startswith('checkpoint-'): 140 | return model_paths[-2] + "_" + model_paths[-1] 141 | else: 142 | return model_paths[-1] 143 | 144 | 145 | 146 | 147 | class KeywordsStoppingCriteria(StoppingCriteria): 148 | def __init__(self, keywords, tokenizer, input_ids): 149 | self.keywords = keywords 150 | self.keyword_ids = [] 151 | self.max_keyword_len = 0 152 | for keyword in keywords: 153 | cur_keyword_ids = tokenizer(keyword).input_ids 154 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 155 | cur_keyword_ids = cur_keyword_ids[1:] 156 | if len(cur_keyword_ids) > self.max_keyword_len: 157 | self.max_keyword_len = len(cur_keyword_ids) 158 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 159 | self.tokenizer = tokenizer 160 | self.start_len = input_ids.shape[1] 161 | 162 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 163 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO 164 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 165 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 166 | for keyword_id in self.keyword_ids: 167 | if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all(): 168 | return True 169 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 170 | for keyword in self.keywords: 171 | if keyword in outputs: 172 | return True 173 | return False -------------------------------------------------------------------------------- /qh360_vl/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.QH360_VL_llama import QH360_VL_LlamaForCausalLM, QH360_VLConfig 2 | -------------------------------------------------------------------------------- /qh360_vl/model/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, 
Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import os 17 | import warnings 18 | import shutil 19 | 20 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 21 | import torch 22 | from qh360_vl.model import * 23 | from qh360_vl.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 24 | import transformers 25 | 26 | def load_pretrained_model(model_path, model_base, model_name, load_bf16=False, load_8bit=False, load_4bit=False, device_map="auto", device="cuda"): 27 | kwargs = {"device_map": device_map} 28 | 29 | if load_8bit: 30 | kwargs['load_in_8bit'] = True 31 | elif load_4bit: 32 | kwargs['load_in_4bit'] = True 33 | kwargs['quantization_config'] = BitsAndBytesConfig( 34 | load_in_4bit=True, 35 | bnb_4bit_compute_dtype=torch.float16, 36 | bnb_4bit_use_double_quant=True, 37 | bnb_4bit_quant_type='nf4' 38 | ) 39 | elif load_bf16: 40 | kwargs['torch_dtype'] = torch.bfloat16 41 | else: 42 | kwargs['torch_dtype'] = torch.float16 43 | 44 | 45 | if '360vl' in model_name.lower(): 46 | # Load qh_360vl model 47 | if 'lora' in model_name.lower() and model_base is None: 48 | warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. 
Detailed instruction: https://github.com/haotian-liu/qh_360vl#launch-a-model-worker-lora-weights-unmerged.') 49 | if 'lora' in model_name.lower() and model_base is not None: 50 | lora_cfg_pretrained = AutoConfig.from_pretrained(model_path) 51 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 52 | print('Loading qh_360vl from base model...') 53 | model = QH360_VL_LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) 54 | token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features 55 | if model.lm_head.weight.shape[0] != token_num: 56 | model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) 57 | model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) 58 | 59 | print('Loading additional qh_360vl weights...') 60 | if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): 61 | non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') 62 | else: 63 | # this is probably from HF Hub 64 | from huggingface_hub import hf_hub_download 65 | def load_from_hf(repo_id, filename, subfolder=None): 66 | cache_file = hf_hub_download( 67 | repo_id=repo_id, 68 | filename=filename, 69 | subfolder=subfolder) 70 | return torch.load(cache_file, map_location='cpu') 71 | non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') 72 | non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} 73 | if any(k.startswith('model.model.') for k in non_lora_trainables): 74 | non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} 75 | model.load_state_dict(non_lora_trainables, strict=False) 76 | 77 | from peft import PeftModel 78 | print('Loading LoRA weights...') 79 | model = PeftModel.from_pretrained(model, model_path) 80 | print('Merging LoRA weights...') 81 | model = model.merge_and_unload() 82 | print('Model is loaded...') 83 | elif model_base is not None: 84 | # this may be mm projector only 85 | print('Loading qh_360vl from base model...') 86 | if 'mpt' in model_name.lower(): 87 | if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')): 88 | shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py')) 89 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True) 90 | cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True) 91 | model = LlavaMPTForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) 92 | else: 93 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 94 | cfg_pretrained = AutoConfig.from_pretrained(model_path) 95 | model = QH360_VL_LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) 96 | 97 | mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') 98 | mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} 99 | model.load_state_dict(mm_projector_weights, strict=False) 100 | else: 101 | if 'mpt' in model_name.lower(): 102 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 103 | model = LlavaMPTForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 104 | else: 105 | tokenizer = 
AutoTokenizer.from_pretrained(model_path, use_fast=False) 106 | model = QH360_VL_LlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 107 | else: 108 | # Load language model 109 | if model_base is not None: 110 | # PEFT model 111 | from peft import PeftModel 112 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 113 | model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto") 114 | print(f"Loading LoRA weights from {model_path}") 115 | model = PeftModel.from_pretrained(model, model_path) 116 | print(f"Merging weights") 117 | model = model.merge_and_unload() 118 | print('Convert to FP16...') 119 | model.to(torch.float16) 120 | else: 121 | use_fast = False 122 | if 'mpt' in model_name.lower(): 123 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 124 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) 125 | else: 126 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 127 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 128 | 129 | image_processor = None 130 | 131 | if '360vl' in model_name.lower(): 132 | mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) 133 | mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) 134 | if mm_use_im_patch_token: 135 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) 136 | if mm_use_im_start_end: 137 | tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) 138 | model.resize_token_embeddings(len(tokenizer)) 139 | 140 | vision_tower = model.get_vision_tower() 141 | if not vision_tower.is_loaded: 142 | vision_tower.load_model() 143 | 144 | vision_tower.to(device=device, dtype=torch.float16) 145 | image_processor = vision_tower.image_processor 146 | 147 | if hasattr(model.config, "max_sequence_length"): 148 | context_len = model.config.max_sequence_length 149 | else: 150 | context_len = 2048 151 | 152 | return tokenizer, model, image_processor, context_len 153 | -------------------------------------------------------------------------------- /qh360_vl/model/language_model/QH360_VL_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | 21 | from torch.nn import CrossEntropyLoss 22 | 23 | from transformers import AutoConfig, AutoModelForCausalLM, \ 24 | LlamaConfig, LlamaModel, LlamaForCausalLM 25 | 26 | from transformers.modeling_outputs import CausalLMOutputWithPast 27 | 28 | from ..QH360_VL_arch_cc import QH360_VL_MetaModel, QH360_VL_MetaForCausalLM 29 | 30 | 31 | class QH360_VLConfig(LlamaConfig): 32 | model_type = "QH_360VL" 33 | 34 | 35 | class QH360_VL_LlamaModel(QH360_VL_MetaModel, LlamaModel): 36 | config_class = QH360_VLConfig 37 | 38 | def __init__(self, config: LlamaConfig): 39 | super(QH360_VL_LlamaModel, self).__init__(config) 40 | 41 | 42 | class QH360_VL_LlamaForCausalLM(LlamaForCausalLM, QH360_VL_MetaForCausalLM): 43 | config_class = QH360_VLConfig 44 | 45 | def __init__(self, config): 46 | super(LlamaForCausalLM, self).__init__(config) 47 | config._attn_implementation = "flash_attention_2" 48 | self.model = QH360_VL_LlamaModel(config) 49 | 50 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 51 | 52 | # Initialize weights and apply final processing 53 | self.post_init() 54 | 55 | def get_model(self): 56 | return self.model 57 | 58 | def forward( 59 | self, 60 | input_ids: torch.LongTensor = None, 61 | attention_mask: Optional[torch.Tensor] = None, 62 | past_key_values: Optional[List[torch.FloatTensor]] = None, 63 | inputs_embeds: Optional[torch.FloatTensor] = None, 64 | labels: Optional[torch.LongTensor] = None, 65 | use_cache: Optional[bool] = None, 66 | output_attentions: Optional[bool] = None, 67 | output_hidden_states: Optional[bool] = None, 68 | images: Optional[torch.FloatTensor] = None, 69 | return_dict: Optional[bool] = None, 70 | ) -> Union[Tuple, CausalLMOutputWithPast]: 71 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions 72 | output_hidden_states = ( 73 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 74 | ) 75 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 76 | 77 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 78 | 79 | # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) 80 | outputs = self.model( 81 | input_ids=input_ids, 82 | attention_mask=attention_mask, 83 | past_key_values=past_key_values, 84 | inputs_embeds=inputs_embeds, 85 | use_cache=use_cache, 86 | output_attentions=output_attentions, 87 | output_hidden_states=output_hidden_states, 88 | return_dict=return_dict 89 | ) 90 | 91 | hidden_states = outputs[0] 92 | logits = self.lm_head(hidden_states) 93 | 94 | loss = None 95 | if labels is not None: 96 | # Shift so that tokens < n predict n 97 | shift_logits = logits[..., :-1, :].contiguous() 98 | shift_labels = labels[..., 1:].contiguous() 99 | # Flatten the tokens 100 | loss_fct = CrossEntropyLoss() 101 | shift_logits = shift_logits.view(-1, self.config.vocab_size) 102 | shift_labels = shift_labels.view(-1) 103 | # Enable model/pipeline parallelism 104 | shift_labels = shift_labels.to(shift_logits.device) 105 | loss = loss_fct(shift_logits, shift_labels) 106 | 107 | if not return_dict: 108 | output = (logits,) + outputs[1:] 109 | return (loss,) + output if loss is not None else output 110 | 111 | return CausalLMOutputWithPast( 112 | loss=loss, 113 
| logits=logits, 114 | past_key_values=outputs.past_key_values, 115 | hidden_states=outputs.hidden_states, 116 | attentions=outputs.attentions, 117 | ) 118 | 119 | def prepare_inputs_for_generation( 120 | self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs 121 | ): 122 | if past_key_values: 123 | input_ids = input_ids[:, -1:] 124 | 125 | # if `inputs_embeds` are passed, we only want to use them in the 1st generation step 126 | if inputs_embeds is not None and past_key_values is None: 127 | model_inputs = {"inputs_embeds": inputs_embeds} 128 | else: 129 | model_inputs = {"input_ids": input_ids} 130 | 131 | model_inputs.update( 132 | { 133 | "past_key_values": past_key_values, 134 | "use_cache": kwargs.get("use_cache"), 135 | "attention_mask": attention_mask, 136 | "images": kwargs.get("images", None), 137 | } 138 | ) 139 | return model_inputs 140 | 141 | AutoConfig.register("QH_360VL", QH360_VLConfig) 142 | AutoModelForCausalLM.register(QH360_VLConfig, QH360_VL_LlamaForCausalLM) 143 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | 9 | 10 | # if 'clip' in vision_tower.lower(): 11 | # return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 12 | 13 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 14 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 15 | 16 | raise ValueError(f'Unknown vision tower: {vision_tower}') 17 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | else: 20 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 21 | 22 | def load_model(self): 23 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 24 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 25 | self.vision_tower.requires_grad_(False) 26 | 27 | self.is_loaded = True 28 | 29 | def feature_select(self, image_forward_outs): 30 | image_features = image_forward_outs.hidden_states[self.select_layer] 31 | if self.select_feature == 'patch': 32 | image_features = image_features[:, 1:] 33 | elif self.select_feature == 'cls_patch': 34 | image_features = image_features 35 | else: 36 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 37 | return image_features 38 | 39 | @torch.no_grad() 40 | def forward(self, images): 41 | if type(images) is list: 42 | image_features = 
[] 43 | for image in images: 44 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 45 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 46 | image_features.append(image_feature) 47 | else: 48 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 49 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 50 | 51 | return image_features 52 | 53 | @property 54 | def dummy_feature(self): 55 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 56 | 57 | @property 58 | def dtype(self): 59 | return self.vision_tower.dtype 60 | 61 | @property 62 | def device(self): 63 | return self.vision_tower.device 64 | 65 | @property 66 | def config(self): 67 | if self.is_loaded: 68 | return self.vision_tower.config 69 | else: 70 | return self.cfg_only 71 | 72 | @property 73 | def hidden_size(self): 74 | return self.config.hidden_size 75 | 76 | @property 77 | def num_patches(self): 78 | return (self.config.image_size // self.config.patch_size) ** 2 79 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | from .projectors import CAbstractor 5 | from transformers import PretrainedConfig 6 | from .configuration_honeybee import HoneybeeConfig,HoneybeeVisualProjectorConfig 7 | import torch.nn.functional as F 8 | 9 | class IdentityMap(nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | @property 17 | def config(self): 18 | return {"mm_projector_type": 'identity'} 19 | 20 | 21 | class SimpleResBlock(nn.Module): 22 | def __init__(self, channels): 23 | super().__init__() 24 | self.pre_norm = nn.LayerNorm(channels) 25 | 26 | self.proj = nn.Sequential( 27 | nn.Linear(channels, channels), 28 | nn.GELU(), 29 | nn.Linear(channels, channels) 30 | ) 31 | def forward(self, x): 32 | x = self.pre_norm(x) 33 | return x + self.proj(x) 34 | 35 | 36 | def build_honeybee_projector(config, projector_type, num_tokens,lm_hidden_size): 37 | """Build projector (abstractor) and query_tokens (optionally for resampler)""" 38 | proj_config = config 39 | proj_type = projector_type 40 | num_tokens = num_tokens 41 | output_hidden_size = lm_hidden_size # LM hidden size 42 | 43 | abstractor = { 44 | "c-abs": CAbstractor, 45 | }[ 46 | proj_type 47 | ](proj_config, num_tokens, output_hidden_size) 48 | return abstractor 49 | 50 | 51 | def build_vision_projector(config, delay_load=False, **kwargs): 52 | projector_type = getattr(config, 'mm_projector_type', 'linear') 53 | 54 | if projector_type == 'linear': 55 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 56 | 57 | if projector_type == 'c-abs': 58 | 59 | local_config_path = config.mm_projector_config 60 | honeybee_config = HoneybeeVisualProjectorConfig.from_pretrained(local_config_path) 61 | 62 | num_tokens = config.mm_num_tokens 63 | 64 | lm_hidden_size = config.hidden_size 65 | 66 | abstractor = build_honeybee_projector(honeybee_config,projector_type,num_tokens,lm_hidden_size) 67 | return abstractor 68 | 69 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 70 | if mlp_gelu_match: 71 | mlp_depth = int(mlp_gelu_match.group(1)) 72 | modules = 
[nn.Linear(config.mm_hidden_size, config.hidden_size)] 73 | for _ in range(1, mlp_depth): 74 | modules.append(nn.GELU()) 75 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 76 | return nn.Sequential(*modules) 77 | 78 | if projector_type == 'identity': 79 | return IdentityMap() 80 | 81 | raise ValueError(f'Unknown projector type: {projector_type}') 82 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/pipeline/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import yaml 4 | from omegaconf import DictConfig, OmegaConf 5 | 6 | 7 | class AttrDict(dict): 8 | __setattr__ = dict.__setitem__ 9 | 10 | def __getattribute__(self, item): 11 | if item in self: 12 | return self[item] 13 | else: 14 | return super().__getattribute__(item) 15 | 16 | @classmethod 17 | def from_nested_dicts(cls, data): 18 | if not isinstance(data, dict): 19 | return data 20 | else: 21 | return cls({key: cls.from_nested_dicts(data[key]) for key in data}) 22 | 23 | def asdict(self): 24 | def _asdict(data): 25 | if not isinstance(data, dict): 26 | return data 27 | else: 28 | return {key: _asdict(data[key]) for key in data} 29 | 30 | return _asdict(self) 31 | 32 | 33 | def save_config(cfg): 34 | """Save config to `config.output_dir/exp_config.yaml`. 35 | """ 36 | output_dir = cfg.output_dir 37 | if isinstance(cfg, AttrDict): 38 | cfg = cfg.asdict() # AttrDict does not work with OmegaConf.to_yaml 39 | 40 | out_conf_dir = os.path.dirname(output_dir) 41 | os.makedirs(out_conf_dir, exist_ok=True) 42 | with open(os.path.join(output_dir, "exp_config.yaml"), "w") as fout: 43 | fout.write(OmegaConf.to_yaml(cfg) + "\n") 44 | 45 | 46 | def set_config(cfg: DictConfig, save: bool = False) -> AttrDict: 47 | # convert DictConfig to AttrDict 48 | # - it is slow to access DictConfig 49 | # - DictConfig makes an unresolved error: 50 | # `RuntimeError: DataLoader worker (pid 7103) is killed by signal: Aborted`. 51 | 52 | OmegaConf.resolve(cfg) 53 | 54 | if save: 55 | # config loaded by hydra is saved to /exp_config.yaml 56 | save_config(cfg) 57 | 58 | cfg = OmegaConf.to_container(cfg) 59 | cfg = AttrDict.from_nested_dicts(cfg) 60 | return cfg 61 | 62 | 63 | def load_config(cfg_path: str) -> AttrDict: 64 | with open(cfg_path, "r") as fin: 65 | cfg = yaml.load(fin, Loader=yaml.FullLoader) 66 | cfg = AttrDict.from_nested_dicts(cfg) 67 | return cfg 68 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/pipeline/data_utils/special_tokens.py: -------------------------------------------------------------------------------- 1 | SYSTEM = "The following is a conversation between a curious human and AI assistant." 2 | SYSTEM_DETAIL = "The assistant gives helpful, detailed, and polite answers to the user's questions." 3 | ORG_SYSTEM = SYSTEM + " " + SYSTEM_DETAIL 4 | IMAGE = "<image>" 5 | _MEDIA_TOKENS = {"image": [IMAGE]} # Special tokens used in this codebase.
6 | # Role pattern tokens 7 | HUMAN = "Human: " 8 | AI = "AI: " 9 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/pipeline/data_utils/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def idx2option(idx: int, style="upper", deco="dot"): 4 | """ 5 | idx: [0, N-1] 6 | style: upper, lower, num 7 | deco: None, paren, dot, rparen 8 | """ 9 | idx = { 10 | "upper": chr(ord("A") + idx), 11 | "lower": chr(ord("a") + idx), 12 | "num": f"{idx + 1}", 13 | }[style] 14 | 15 | idx = { 16 | None: "{idx}", 17 | "paren": "({idx})", 18 | "dot": "{idx}.", 19 | "rparen": "{idx})", 20 | }[deco].format(idx=idx) 21 | 22 | return idx 23 | 24 | 25 | def optionize( 26 | options: list[str], 27 | answer_idx: int, 28 | shuffle=False, 29 | aug_idx_style=False, 30 | include_answer_str=False, 31 | sep="\n" 32 | ) -> (str, str): 33 | """Convert options (list of str) to option string. 34 | This process also includes: 35 | - option shuffling 36 | - index augmentation 37 | Args: 38 | options (list[str]) 39 | answer_idx (int) 40 | shuffle (bool): shuffle options 41 | aug_idx_style (bool): randomly choose index style 42 | Aug examples: (1) / 1. / (A) / A. 43 | include_answer_str (bool): include answer string 44 | False: A 45 | True: A. {answer} 46 | Return: 47 | (option_str, answer_str) 48 | """ 49 | if isinstance(options, str): 50 | # already optionized 51 | return options 52 | 53 | answer = options[answer_idx] 54 | if shuffle: 55 | random.shuffle(options) 56 | answer_idx = options.index(answer) 57 | 58 | if not aug_idx_style: 59 | style = "upper" 60 | deco = "dot" 61 | else: 62 | style = random.choice(["upper", "lower", "num"]) 63 | deco = random.choice(["paren", "dot", "rparen"]) 64 | 65 | indices = [idx2option(i, style=style, deco=deco) for i in range(len(options))] 66 | answer_str = idx2option(answer_idx, style=style, deco=None) 67 | if include_answer_str: 68 | answer_str = f"{answer_str}. {answer}" 69 | 70 | options_with_index = [ 71 | f"{idx} {option}" 72 | for idx, option in zip(indices, options) 73 | ] 74 | option_str = sep.join(options_with_index) 75 | return option_str, answer_str 76 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/pipeline/interface.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from peft import PeftConfig, PeftModel 5 | from PIL import Image 6 | from transformers import AutoTokenizer 7 | from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD 8 | from pathlib import Path 9 | 10 | from honeybee.modeling_honeybee import HoneybeeForConditionalGeneration 11 | from honeybee.processing_honeybee import HoneybeeImageProcessor, HoneybeeProcessor 12 | 13 | 14 | def load_model(pretrained_ckpt, use_bf16=True, load_in_8bit=False): 15 | """Model loader. 16 | 17 | Args: 18 | pretrained_ckpt (string): The path to pre-trained checkpoint. 19 | use_bf16 (bool, optional): Whether to use bfloat16 to load the model. (Default: True) 20 | load_in_8bit(bool, optional): Flag to load model in 8it. (Default: False) 21 | 22 | Returns: 23 | model: Honeybee Model 24 | """ 25 | 26 | # we check whether the model is trained using PEFT 27 | # by checking existance of 'adapter_config.json' is in pretrained_ckpt folder. 
28 | is_peft = os.path.exists(os.path.join(pretrained_ckpt, "adapter_config.json")) 29 | 30 | if is_peft: 31 | # when using checkpoints trained using PEFT (by us) 32 | config = PeftConfig.from_pretrained(pretrained_ckpt) 33 | if config.base_model_name_or_path == "": 34 | # when pre-training, there is no definition of base_model_name_or_path 35 | # but, we saved the base model at /base 36 | config.base_model_name_or_path = os.path.join(os.path.dirname(pretrained_ckpt), "base") 37 | 38 | base_model = HoneybeeForConditionalGeneration.from_pretrained( 39 | config.base_model_name_or_path, 40 | load_in_8bit=load_in_8bit, 41 | torch_dtype=torch.bfloat16 if use_bf16 else torch.half, 42 | # avoiding RuntimeError: Expected all tensors to be on the same device 43 | device_map={"": int(os.environ.get("LOCAL_RANK", 0))}, 44 | ) 45 | model = PeftModel.from_pretrained( 46 | base_model, 47 | pretrained_ckpt, 48 | is_trainable=True, 49 | torch_dtype=torch.bfloat16 if use_bf16 else torch.half, 50 | ) 51 | else: 52 | # when using original mllm checkpoints 53 | model = HoneybeeForConditionalGeneration.from_pretrained( 54 | pretrained_ckpt, 55 | torch_dtype=torch.bfloat16 if use_bf16 else torch.half, 56 | ) 57 | return model 58 | 59 | 60 | def get_model(pretrained_ckpt, use_bf16=True, load_in_8bit=False): 61 | """Model Provider with tokenizer and processor. 62 | 63 | Args: 64 | pretrained_ckpt (string): The path to pre-trained checkpoint. 65 | use_bf16 (bool, optional): Whether to use bfloat16 to load the model. (Default: True) 66 | load_in_8bit(bool, optional): Flag to load model in 8it. (Default: False) 67 | 68 | Returns: 69 | model: Honeybee Model 70 | tokenizer: Honeybee (Llama) text tokenizer 71 | processor: Honeybee processor (including text and image) 72 | """ 73 | # Load model where base_ckpt is different when the target model is trained by PEFT 74 | model = load_model(pretrained_ckpt, use_bf16, load_in_8bit) 75 | 76 | image_size = model.config.vision_config.image_size 77 | num_query_tokens = model.config.num_query_tokens 78 | num_eos_tokens = getattr(model.config.visual_projector_config, "num_eos_tokens", 1) 79 | num_visual_tokens = num_query_tokens + num_eos_tokens 80 | 81 | # Build processor 82 | image_processor = HoneybeeImageProcessor( 83 | size=image_size, 84 | crop_size=image_size, 85 | image_mean=OPENAI_CLIP_MEAN, 86 | image_std=OPENAI_CLIP_STD, 87 | ) 88 | # Load tokenizer (LlamaTokenizer) 89 | tokenizer_ckpt = model.config.lm_config.pretrained_tokenizer_name_or_path 90 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_ckpt, use_fast=False) 91 | if tokenizer.pad_token is None: 92 | tokenizer.pad_token = tokenizer.unk_token 93 | processor = HoneybeeProcessor( 94 | image_processor, tokenizer, num_visual_token=num_visual_tokens 95 | ) 96 | 97 | return model, tokenizer, processor 98 | 99 | 100 | def do_generate( 101 | prompts, image_list, model, tokenizer, processor, use_bf16=False, **generate_kwargs 102 | ): 103 | """The interface for generation 104 | 105 | Args: 106 | prompts (List[str]): The prompt text 107 | image_list (List[str]): Paths of images 108 | model (HoneybeeForConditionalGeneration): HoneybeeForConditionalGeneration 109 | tokenizer (AutoTokenizer): AutoTokenizer 110 | processor (HoneybeeProcessor): HoneybeeProcessor 111 | use_bf16 (bool, optional): Whether to use bfloat16. Defaults to False. 112 | 113 | Returns: 114 | sentence (str): Generated sentence. 
115 | """ 116 | if image_list: 117 | images = [Image.open(_) for _ in image_list] 118 | else: 119 | images = None 120 | inputs = processor(text=prompts, images=images) 121 | inputs = {k: v.bfloat16() if v.dtype == torch.float else v for k, v in inputs.items()} 122 | inputs = {k: v.to(model.device) for k, v in inputs.items()} 123 | with torch.no_grad(): 124 | res = model.generate(**inputs, **generate_kwargs) 125 | sentence = tokenizer.decode(res.tolist()[0], skip_special_tokens=True) 126 | return sentence 127 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/projectors.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | import torch.nn as nn 5 | from einops import rearrange 6 | from timm.models.layers import LayerNorm, LayerNorm2d 7 | from timm.models.regnet import RegStage 8 | 9 | 10 | from .configuration_honeybee import HoneybeeVisualProjectorConfig 11 | from torch.nn import functional as F 12 | import math 13 | 14 | def build_pos_embeds( 15 | config: HoneybeeVisualProjectorConfig, num_input_tokens: int, vision_hidden_size: int 16 | ): 17 | # pos emb 18 | # true 19 | if config.pos_emb: 20 | pos_emb = torch.nn.Parameter(torch.zeros(1, num_input_tokens, vision_hidden_size)) 21 | nn.init.trunc_normal_(pos_emb, mean=0.0, std=0.02) 22 | else: 23 | pos_emb = None 24 | 25 | return pos_emb 26 | 27 | 28 | def build_eos_tokens(config: HoneybeeVisualProjectorConfig, output_hidden_size: int): 29 | # think tokens 30 | num_eos_tokens = config.num_eos_tokens 31 | # 0 32 | if num_eos_tokens: 33 | eos_tokens = torch.nn.Parameter(torch.randn(1, num_eos_tokens, output_hidden_size)) 34 | nn.init.trunc_normal_(eos_tokens, mean=0.0, std=config.initializer_range) 35 | else: 36 | eos_tokens = None 37 | 38 | return eos_tokens 39 | 40 | 41 | def build_prenorm(config: HoneybeeVisualProjectorConfig): 42 | # false 43 | if config.prenorm: 44 | prenorm = LayerNorm(config.encoder_hidden_size) 45 | else: 46 | prenorm = None 47 | return prenorm 48 | 49 | 50 | def build_mlp(depth, hidden_size, output_hidden_size): 51 | layers = [nn.Linear(hidden_size, output_hidden_size)] 52 | for _ in range(1, depth): 53 | layers.append(nn.SiLU()) 54 | layers.append(nn.Linear(output_hidden_size, output_hidden_size)) 55 | return nn.Sequential(*layers) 56 | 57 | def get_abs_pos(abs_pos, tgt_size): 58 | # abs_pos: L, C 59 | # tgt_size: M 60 | # return: M, C 61 | # 16,24 62 | src_size = int(math.sqrt(abs_pos.size(1))) 63 | # 32,48 64 | tgt_size = int(math.sqrt(tgt_size)) 65 | dtype = abs_pos.dtype 66 | 67 | if src_size != tgt_size: 68 | return F.interpolate( 69 | abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), 70 | size=(tgt_size, tgt_size), 71 | mode="bicubic", 72 | align_corners=False, 73 | ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) 74 | else: 75 | return abs_pos 76 | 77 | 78 | class Projector(nn.Module): 79 | """Base projector class""" 80 | 81 | def __init__( 82 | self, 83 | config: HoneybeeVisualProjectorConfig, 84 | num_input_tokens: int, 85 | output_hidden_size: int, 86 | ): 87 | super().__init__() 88 | self.config = config 89 | self.num_input_tokens = num_input_tokens 90 | self.output_hidden_size = output_hidden_size 91 | 92 | # think tokens 93 | self.eos_tokens = build_eos_tokens(config, output_hidden_size) 94 | 95 | # pos emb 96 | self.pos_emb = build_pos_embeds(config, num_input_tokens, config.encoder_hidden_size) 97 | 98 | self.prenorm = 
build_prenorm(config) 99 | 100 | self.build_net() 101 | 102 | def build_net(self): 103 | raise NotImplementedError() 104 | 105 | def _forward(self, x): 106 | raise NotImplementedError() 107 | 108 | def forward(self, x: torch.Tensor) -> torch.Tensor: 109 | """ 110 | Args: 111 | x: (B, L, encoder_hidden_size) tensor from the visual backbone (CLIP visual encoder), including cls token. 112 | """ 113 | if self.prenorm is not None: 114 | x = self.prenorm(x) 115 | 116 | if self.pos_emb is not None: 117 | # self.pos_emb = self.pos_emb[:,1:] 118 | pos_emb = get_abs_pos(self.pos_emb[:,1:], x.size(1)) 119 | pos_emb = pos_emb.to(device=x.device) 120 | x += pos_emb 121 | 122 | x = self._forward(x) # (B, L, output_hidden_size) 123 | 124 | B = x.size(0) 125 | if self.eos_tokens is not None: 126 | x = torch.cat([x, self.eos_tokens.expand(B, -1, -1)], dim=1) 127 | return x 128 | 129 | 130 | class ConvProjector(Projector): 131 | def _forward(self, x): 132 | # x: [B, L, dim] 133 | # x = x[:, 1:] # drop cls token and 2d forward 134 | 135 | hw = int(x.size(1) ** 0.5) 136 | x = rearrange(x, "b (h w) d -> b d h w", h=hw, w=hw) 137 | x = self.net(x) 138 | x = rearrange(x, "b d h w -> b (h w) d") 139 | x = self.readout(x) 140 | 141 | return x 142 | 143 | 144 | class CAbstractor(ConvProjector): 145 | """C-Abstractor""" 146 | def build_net(self): 147 | encoder_hidden_size = self.config.encoder_hidden_size 148 | hidden_size = self.config.hidden_size 149 | output_hidden_size = self.output_hidden_size 150 | depth = self.config.depth 151 | mlp_depth = self.config.mlp_depth 152 | 153 | n_queries = self.config.num_queries 154 | assert (n_queries ** 0.5).is_integer(), "n_queries must be square number" 155 | hw = int(n_queries ** 0.5) 156 | 157 | # RegBlock = ResBlock + SE 158 | RegBlock = partial( 159 | RegStage, 160 | stride=1, 161 | dilation=1, 162 | act_layer=nn.SiLU, 163 | norm_layer=LayerNorm2d, 164 | ) 165 | 166 | s1 = RegBlock( 167 | depth, 168 | encoder_hidden_size, 169 | hidden_size, 170 | ) 171 | sampler = nn.AdaptiveAvgPool2d((hw, hw)) 172 | s2 = RegBlock( 173 | depth, 174 | hidden_size, 175 | hidden_size, 176 | ) 177 | 178 | self.net = nn.Sequential(s1, sampler, s2) 179 | self.readout = build_mlp(mlp_depth, hidden_size, output_hidden_size) 180 | 181 | -------------------------------------------------------------------------------- /qh360_vl/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /qh360_vl/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360CVGroup/360VL/ad6a11c15d41cfea2fe487e0d2c88feb138546af/qh360_vl/serve/__init__.py -------------------------------------------------------------------------------- /qh360_vl/serve/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 5 | from qh360_vl.conversation import conv_templates, SeparatorStyle 6 | from qh360_vl.model.builder import load_pretrained_model 7 | from qh360_vl.utils import disable_torch_init 8 | from qh360_vl.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 9 | 10 | from PIL import Image 11 | 12 | import requests 13 | from PIL import Image 14 | from io import BytesIO 15 | from transformers import TextStreamer 16 | 17 | 18 | def load_image(image_file): 19 | if image_file.startswith('http://') or image_file.startswith('https://'): 20 | response = requests.get(image_file) 21 | image = Image.open(BytesIO(response.content)).convert('RGB') 22 | else: 23 | image = Image.open(image_file).convert('RGB') 24 | return image 25 | 26 | 27 | def main(args): 28 | # Model 29 | disable_torch_init() 30 | 31 | model_name = get_model_name_from_path(args.model_path) 32 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device) 33 | 34 | if 'llama-2' in model_name.lower(): 35 | conv_mode = "llava_llama_2" 36 | elif "v1" in model_name.lower(): 37 | conv_mode = "llava_v1" 38 | elif "mpt" in model_name.lower(): 39 | conv_mode = "mpt" 40 | else: 41 | conv_mode = "llava_v0" 42 | 43 | if args.conv_mode is not None and conv_mode != args.conv_mode: 44 | print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode)) 45 | else: 46 | args.conv_mode = conv_mode 47 | 48 | conv = conv_templates[args.conv_mode].copy() 49 | if "mpt" in model_name.lower(): 50 | roles = ('user', 'assistant') 51 | else: 52 | roles = conv.roles 53 | 54 | image = load_image(args.image_file) 55 | # Similar operation in model_worker.py 56 | image_tensor = process_images([image], image_processor, args) 57 | if type(image_tensor) is list: 58 | image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor] 59 | else: 60 | image_tensor = image_tensor.to(model.device, dtype=torch.float16) 61 | 62 | while True: 63 | try: 64 | inp = input(f"{roles[0]}: ") 65 | except EOFError: 66 | inp = "" 67 | if not inp: 68 | print("exit...") 69 | break 70 | 71 | print(f"{roles[1]}: ", end="") 72 | 73 | if image is not None: 74 | # first message 75 | if model.config.mm_use_im_start_end: 76 | inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp 77 | else: 78 | inp = 
DEFAULT_IMAGE_TOKEN + '\n' + inp 79 | conv.append_message(conv.roles[0], inp) 80 | image = None 81 | else: 82 | # later messages 83 | conv.append_message(conv.roles[0], inp) 84 | conv.append_message(conv.roles[1], None) 85 | prompt = conv.get_prompt() 86 | 87 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 88 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 89 | keywords = [stop_str] 90 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 91 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) 92 | 93 | with torch.inference_mode(): 94 | output_ids = model.generate( 95 | input_ids, 96 | images=image_tensor, 97 | do_sample=True, 98 | temperature=args.temperature, 99 | max_new_tokens=args.max_new_tokens, 100 | streamer=streamer, 101 | use_cache=True, 102 | stopping_criteria=[stopping_criteria]) 103 | 104 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() 105 | conv.messages[-1][-1] = outputs 106 | 107 | if args.debug: 108 | print("\n", {"prompt": prompt, "outputs": outputs}, "\n") 109 | 110 | 111 | if __name__ == "__main__": 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 114 | parser.add_argument("--model-base", type=str, default=None) 115 | parser.add_argument("--image-file", type=str, required=True) 116 | parser.add_argument("--device", type=str, default="cuda") 117 | parser.add_argument("--conv-mode", type=str, default=None) 118 | parser.add_argument("--temperature", type=float, default=0.2) 119 | parser.add_argument("--max-new-tokens", type=int, default=512) 120 | parser.add_argument("--load-8bit", action="store_true") 121 | parser.add_argument("--load-4bit", action="store_true") 122 | parser.add_argument("--debug", action="store_true") 123 | parser.add_argument("--image-aspect-ratio", type=str, default='pad') 124 | args = parser.parse_args() 125 | main(args) 126 | -------------------------------------------------------------------------------- /qh360_vl/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360CVGroup/360VL/ad6a11c15d41cfea2fe487e0d2c88feb138546af/qh360_vl/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /qh360_vl/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360CVGroup/360VL/ad6a11c15d41cfea2fe487e0d2c88feb138546af/qh360_vl/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /qh360_vl/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /qh360_vl/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from qh360_vl.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /qh360_vl/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from qh360_vl.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
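# Module-level handler shared by build_logger() below: it is created once as a
# TimedRotatingFileHandler (daily rotation, UTC) under LOGDIR and then attached
# to every registered logger, so all loggers in the process write to one log file.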
13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True) 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 
105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_seed_for_submission.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | eval_single(args.result_file, eval_only_type='video') 63 | 64 | with open(args.result_upload_file, 'w') as fp: 65 | for question in data['questions']: 66 | qid = question['question_id'] 67 | if qid in results: 68 | result = results[qid] 69 | else: 70 | result = results[int(qid)] 71 | fp.write(json.dumps({ 72 | 'question_id': qid, 73 | 'prediction': result['text'] 74 | }) + '\n') 75 | -------------------------------------------------------------------------------- /scripts/convert_sqa_to_llava.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot( 13 | problems, split_indices, prompt_format, 14 | use_caption=False, is_test=False) 15 | 16 | target_format = [] 17 | for prob_id, (input, output) in split_problems.items(): 18 | if input.startswith('Question: '): 19 | input = input.replace('Question: ', '') 20 | if output.startswith('Answer: '): 21 | output = output.replace('Answer: ', '') 22 | 23 | raw_prob_data 
= problems[prob_id] 24 | if raw_prob_data['image'] is None: 25 | target_format.append({ 26 | "id": prob_id, 27 | "conversations": [ 28 | {'from': 'human', 'value': f"{input}"}, 29 | {'from': 'gpt', 'value': f"{output}"}, 30 | ], 31 | }) 32 | 33 | else: 34 | target_format.append({ 35 | "id": prob_id, 36 | "image": os.path.join(prob_id, raw_prob_data['image']), 37 | "conversations": [ 38 | {'from': 'human', 'value': f"{input}\n"}, 39 | {'from': 'gpt', 'value': f"{output}"}, 40 | ], 41 | }) 42 | 43 | print(f'Number of samples: {len(target_format)}') 44 | 45 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 46 | json.dump(target_format, f, indent=2) 47 | 48 | 49 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 50 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 51 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 52 | 53 | split_problems = build_prompt_chatbot( 54 | problems, split_indices, prompt_format, 55 | use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith('Question: '): 60 | input = input.replace('Question: ', '') 61 | if output.startswith('Answer: '): 62 | output = output.replace('Answer: ', '') 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data['image'] is None: 66 | data = { 67 | "id": prob_id, 68 | "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data['image']), 76 | "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + '\n') 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 
'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, open(dst, 'w')) 57 | -------------------------------------------------------------------------------- /scripts/eval/custom_vqa.sh: -------------------------------------------------------------------------------- 1 | INIT_MODEL_PATH="/hbox2dir" 2 | 3 | name="qh360_vl-llama3-70B" 4 | 5 | python -m qh360_vl.eval.model_vqa_loader_llama3_nodist \ 6 | --model-path $INIT_MODEL_PATH/$name \ 7 | --question-file custom/vqa_test_custom.jsonl \ 8 | --image-folder custom/vqa \ 9 | --answers-file custom/$name.jsonl \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --conv-mode llama3 13 | -------------------------------------------------------------------------------- /scripts/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="qh360_vl-8B" 9 | SPLIT="llava_gqa_testdev_balanced" 10 | GQADIR="./playground/data/eval/gqa/data" 11 | INIT_MODEL_PATH="/hbox2dir" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m qh360_vl.eval.model_vqa_loader_raw \ 15 | --model-path $INIT_MODEL_PATH/$CKPT \ 16 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 17 | --image-folder ./playground/data/eval/gqa/data/images \ 18 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --slide_window \ 23 | --conv-mode llama3 & 24 | done 
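# Each backgrounded job above scores one shard of the GQA questions on its own GPU
# (the shard is selected via --num-chunks/--chunk-idx); after `wait`, the per-shard
# answer files are concatenated into merge.jsonl and converted for the official GQA evaluator.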
25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 39 | 40 | cd $GQADIR 41 | python eval/eval.py --tier testdev_balanced -------------------------------------------------------------------------------- /scripts/eval/infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | python -m qh360_vl.eval.infer \ 6 | --model-path $INIT_MODEL_PATH/$CKPT \ 7 | --image-path /hbox2dir/test.jpg \ 8 | --slide_window 9 | -------------------------------------------------------------------------------- /scripts/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | 4 | name="qh360_vl-llama3-70B" 5 | python -m qh360_vl.eval.model_vqa \ 6 | --model-path $INIT_MODEL_PATH/$name \ 7 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 8 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 9 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/$name.jsonl \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --conv-mode llama3 13 | 14 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 15 | 16 | python qh360_vl/eval/eval_gpt_review_bench.py \ 17 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 18 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 19 | --rule llava/eval/table/rule.json \ 20 | --answer-list \ 21 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 22 | playground/data/eval/llava-bench-in-the-wild/answers/$name.jsonl \ 23 | --output \ 24 | playground/data/eval/llava-bench-in-the-wild/reviews/$name.jsonl 25 | 26 | python qh360_vl/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/$name.jsonl -------------------------------------------------------------------------------- /scripts/eval/mmb_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | for SPLIT in {"mmbench_dev_cn_20231003","mmbench_test_cn_20231003",} 6 | do 7 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_mmbench_llama3 \ 8 | --model-path $INIT_MODEL_PATH/$CKPT \ 9 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 10 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 11 | --single-pred-prompt \ 12 | --temperature 0 \ 13 | --slide_window \ 14 | --lang cn \ 15 | --conv-mode llama3 \ 16 | 17 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 18 | 19 | python scripts/convert_mmbench_for_submission.py \ 20 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 21 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 22 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 23 | --experiment $CKPT 24 | done -------------------------------------------------------------------------------- /scripts/eval/mmb_en.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | for SPLIT in {"mmbench_dev_en_20231003","mmbench_test_en_20231003",} 6 | do 7 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_mmbench_llama3 \ 8 | --model-path $INIT_MODEL_PATH/$CKPT \ 9 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 10 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 11 | --single-pred-prompt \ 12 | --temperature 0 \ 13 | --slide_window \ 14 | --lang en \ 15 | --conv-mode llama3 \ 16 | 17 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 18 | 19 | python scripts/convert_mmbench_for_submission.py \ 20 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 21 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 22 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 23 | --experiment $CKPT 24 | done -------------------------------------------------------------------------------- /scripts/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_mme_llama3 \ 6 | --model-path $INIT_MODEL_PATH/$CKPT \ 7 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 8 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 9 | --answers-file ./playground/data/eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --conv-mode llama3 13 | 14 | cd ./playground/data/eval/MME 15 | python convert_answer_to_mme.py --experiment $CKPT 16 | 17 | cd eval_tool 18 | python calculation.py --results_dir answers/$CKPT -------------------------------------------------------------------------------- /scripts/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | for SPLIT in {"validation","test",} 6 | do 7 | python -m qh360_vl.eval.model_vqa_mmmu \ 8 | --model-path $INIT_MODEL_PATH/$CKPT \ 9 | --data-path ./playground/data/eval/mmmu/MMMU \ 10 | --config-path ./playground/data/eval/mmmu/config.yaml \ 11 | --output-path ./playground/data/eval/mmmu/answers_upload/$SPLIT/$CKPT.json \ 12 | --split $SPLIT \ 13 | --slide_window \ 14 | --conv-mode llama3 15 | 16 | if [[ $SPLIT == "validation" ]] 17 | then 18 | python ./playground/data/eval/mmmu/eval_mmmu.py \ 19 | --output-path ./playground/data/eval/mmmu/answers_upload/$SPLIT/$CKPT.json 20 | fi 21 | done -------------------------------------------------------------------------------- /scripts/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_pope_llama3 \ 6 | --model-path $INIT_MODEL_PATH/$CKPT \ 7 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 8 | --image-folder ./playground/data/eval/pope/val2014 \ 9 | --answers-file ./playground/data/eval/pope/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --conv-mode llama3 13 | 14 | python qh360_vl/eval/eval_pope.py \ 15 | --annotation-dir ./playground/data/eval/pope/coco \ 16 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 17 | --result-file 
./playground/data/eval/pope/answers/$CKPT.jsonl -------------------------------------------------------------------------------- /scripts/eval/refcoco.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_refcoco_llama3 \ 6 | --model-path $INIT_MODEL_PATH/$CKPT \ 7 | --question-file ./playground/data/eval/refcoco/REFCOCO_VAL_en_new.jsonl \ 8 | --image-folder ./playground/data/eval/refcoco/train2014 \ 9 | --answers-file ./playground/data/eval/res_test/$CKPT/refcoco.json \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --patch_img_size 336 \ 13 | --conv-mode llama3 \ 14 | 15 | python ./qh360_vl/eval/compute_precision.py ./playground/data/eval/res_test/$CKPT/refcoco.json -------------------------------------------------------------------------------- /scripts/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_textvqa_llama3 \ 6 | --model-path $INIT_MODEL_PATH/$CKPT \ 7 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 8 | --image-folder ./playground/data/eval/textvqa/train_images \ 9 | --answers-file ./playground/data/eval/textvqa/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --conv-mode llama3 13 | 14 | python -m qh360_vl.eval.eval_textvqa \ 15 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 16 | --result-file ./playground/data/eval/textvqa/answers/$CKPT.jsonl -------------------------------------------------------------------------------- /scripts/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="qh360_vl-8B" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | INIT_MODEL_PATH="/hbox2dir" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m qh360_vl.eval.model_vqa_loader_raw \ 14 | --model-path $INIT_MODEL_PATH/$CKPT \ 15 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/vqav2/test2015 \ 17 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --slide_window \ 22 | --conv-mode llama3 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT -------------------------------------------------------------------------------- /scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 
4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | --------------------------------------------------------------------------------
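A minimal invocation sketch for scripts/extract_mm_projector.py above; the checkpoint and output paths are placeholders, not defaults shipped with the repo. The script writes a torch-loadable state dict containing only the weights whose keys match 'mm_projector'.

```Shell
python scripts/extract_mm_projector.py \
    --model-path /path/to/360VL-8B \
    --output /path/to/mm_projector.bin
```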