├── .gitignore ├── LICENSE ├── README.md ├── deploy.sh ├── docs ├── 008.jpg └── Evaluation.md ├── pyproject.toml ├── qh360_vl ├── 360vl.PNG ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── compute_precision.py │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── infer.py │ ├── m4c_evaluator.py │ ├── model_vqa.py │ ├── model_vqa_loader_llama3.py │ ├── model_vqa_loader_llama3_nodist.py │ ├── model_vqa_loader_raw.py │ ├── model_vqa_mmbench_llama3.py │ ├── model_vqa_mme_llama3.py │ ├── model_vqa_mmmu.py │ ├── model_vqa_pope_llama3.py │ ├── model_vqa_refcoco_llama3.py │ ├── model_vqa_textvqa_llama3.py │ └── summarize_gpt_review.py ├── mm_utils.py ├── model │ ├── QH360_VL_arch_cc.py │ ├── __init__.py │ ├── builder.py │ ├── language_model │ │ └── QH360_VL_llama.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ ├── builder.py │ │ ├── configuration_honeybee.py │ │ ├── pipeline │ │ │ ├── config.py │ │ │ ├── data_utils │ │ │ │ ├── special_tokens.py │ │ │ │ └── utils.py │ │ │ └── interface.py │ │ └── projectors.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ └── test_message.py └── utils.py └── scripts ├── convert_gqa_for_eval.py ├── convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_seed_for_submission.py ├── convert_sqa_to_llava.py ├── convert_sqa_to_llava_base_prompt.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── eval ├── custom_vqa.sh ├── gqa.sh ├── infer.sh ├── llavabench.sh ├── mmb_cn.sh ├── mmb_en.sh ├── mme.sh ├── mmmu.sh ├── pope.sh ├── refcoco.sh ├── textvqa.sh └── vqav2.sh └── extract_mm_projector.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | *.pyc 4 | *.egg-info 5 | dist 6 | 7 | # Log 8 | *.log 9 | *.log.* 10 | *.json 11 | *.jsonl 12 | 13 | # Data 14 | !**/alpaca-data-conversation.json 15 | 16 | # Editor 17 | .idea 18 | *.swp 19 | 20 | # Other 21 | .DS_Store 22 | wandb 23 | output 24 | 25 | # Dir 26 | .ipynb_checkpoints/ 27 | __pycache__/ 28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 360VL 2 | 3 |
6 | 7 | **360VL** is developed based on the LLama3 language model and is also the industry's first open source large multi-modal model based on **LLama3-70B**[[🤗Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)]. In addition to applying the Llama3 language model, the 360VL model also designs a globally aware multi-branch projector architecture, which enables the model to have more sufficient image understanding capabilities. 8 | 9 | 10 | ## Contents 11 | - [Install](#install) 12 | - [Model Zoo](#llava-weights) 13 | - [Demo](#Demo) 14 | - [Evaluation](#evaluation) 15 | 16 | ## Install 17 | 18 | 1. Clone this repository and navigate to 360VL folder 19 | ```bash 20 | git clone https://github.com/360CVGroup/360VL.git 21 | cd 360VL 22 | ``` 23 | 24 | 2. Install Package 25 | ```Shell 26 | conda create -n qh360_vl python=3.10 -y 27 | conda activate qh360_vl 28 | bash deploy.sh 29 | ``` 30 | 31 | ## Model Zoo 32 | | Model | Checkpoints | MMBT | MMBD|MMB-CNT | MMB-CND|MMMUV|MMMUT| MME | 33 | |:--------------------|:------------:|:----:|:------:|:------:|:-------:|:-------:|:-------:|:-------:| 34 | | QWen-VL-Chat | [🤗LINK](https://huggingface.co/Qwen/Qwen-VL-Chat) | 61.8 | 60.6 | 56.3 | 56.7 |37| 32.9 | 1860 | 35 | | mPLUG-Owl2 | [🤖LINK](https://www.modelscope.cn/models/iic/mPLUG-Owl2/summary) | 66.0 | 66.5 | 60.3 | 59.5 |34.7| 32.1 | 1786.4 | 36 | | CogVLM | [🤗LINK](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf) | 65.8| 63.7 | 55.9 | 53.8 |37.3| 30.1 | 1736.6| 37 | | Monkey-Chat | [🤗LINK](https://huggingface.co/echo840/Monkey-Chat) | 72.4| 71 | 67.5 | 65.8 |40.7| - | 1887.4| 38 | | MM1-7B-Chat | [LINK](https://ar5iv.labs.arxiv.org/html/2403.09611) | -| 72.3 | - | - |37.0| 35.6 | 1858.2| 39 | | IDEFICS2-8B | [🤗LINK](https://huggingface.co/HuggingFaceM4/idefics2-8b) | 75.7 | 75.3 | 68.6 | 67.3 |43.0| 37.7 |1847.6| 40 | | SVIT-v1.5-13B| [🤗LINK](https://huggingface.co/Isaachhe/svit-v1.5-13b-full) | 69.1 | - | 63.1 | - | 38.0| 33.3|1889| 41 | | LLaVA-v1.5-13B | [🤗LINK](https://huggingface.co/liuhaotian/llava-v1.5-13b) | 69.2 | 69.2 | 65 | 63.6 |36.4| 33.6 | 1826.7| 42 | | LLaVA-v1.6-13B | [🤗LINK](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-13b) | 70 | 70.7 | 68.5 | 64.3 |36.2| - |1901| 43 | | Honeybee | [LINK](https://github.com/kakaobrain/honeybee) | 73.6 | 74.3 | - | - |36.2| -|1976.5| 44 | | YI-VL-34B | [🤗LINK](https://huggingface.co/01-ai/Yi-VL-34B) | 72.4 | 71.1 | 70.7 | 71.4 |45.1| 41.6 |2050.2| 45 | | **360VL-8B** | [🤗LINK](https://huggingface.co/qihoo360/360VL-8B) | 75.3 | 73.7 | 71.1 | 68.6 |39.7| 37.1 | 1944.6| 46 | | **360VL-70B** | [🤗LINK](https://huggingface.co/qihoo360/360VL-70B) | 78.1 | 80.4 | 76.9 | 77.7 |50.8| 44.3 | 2012.3| 47 | 48 | 49 | ## Quick Start 🤗 50 | 51 | ```Shell 52 | from transformers import AutoModelForCausalLM, AutoTokenizer 53 | import torch 54 | from PIL import Image 55 | 56 | checkpoint = "qihoo360/360VL-70B" 57 | 58 | model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16, device_map='auto', trust_remote_code=True).eval() 59 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True) 60 | vision_tower = model.get_vision_tower() 61 | vision_tower.load_model() 62 | vision_tower.to(device="cuda", dtype=torch.float16) 63 | image_processor = vision_tower.image_processor 64 | tokenizer.pad_token = tokenizer.eos_token 65 | 66 | 67 | image = Image.open("docs/008.jpg").convert('RGB') 68 | query = "Who is this cartoon character?" 
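# `docs/008.jpg` and the query above are only an example prompt; any local RGB image
# and question can be substituted. The `terminators` list below collects the token id of
# Llama 3's "<|eot_id|>" end-of-turn marker so that generation stops once the assistant
# finishes its reply.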
69 | terminators = [ 70 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 71 | ] 72 | 73 | inputs = model.build_conversation_input_ids(tokenizer, query=query, image=image, image_processor=image_processor) 74 | 75 | input_ids = inputs["input_ids"].to(device='cuda', non_blocking=True) 76 | images = inputs["image"].to(dtype=torch.float16, device='cuda', non_blocking=True) 77 | 78 | output_ids = model.generate( 79 | input_ids, 80 | images=images, 81 | do_sample=False, 82 | eos_token_id=terminators, 83 | num_beams=1, 84 | max_new_tokens=512, 85 | use_cache=True) 86 | 87 | input_token_len = input_ids.shape[1] 88 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 89 | outputs = outputs.strip() 90 | print(outputs) 91 | ``` 92 | 93 | ## Demo 94 | 95 | To run our demo, you need to download the weights of 360VL[🤗LINK](https://huggingface.co/qihoo360/360VL-70B) and the weights of CLIP-ViT-336[🤗LINK](https://huggingface.co/openai/clip-vit-large-patch14-336) 96 | 97 | ### Gradio Web UI 98 | 99 | To launch a Gradio demo locally, please run the following commands one by one. If you plan to launch multiple model workers to compare between different checkpoints, you only need to launch the controller and the web server *ONCE*. 100 | 101 | #### Launch a controller 102 | ```Shell 103 | python -m qh360_vl.serve.controller --host 0.0.0.0 --port 10000 104 | ``` 105 | 106 | #### Launch a gradio web server. 107 | ```Shell 108 | python -m qh360_vl.serve.gradio_web_server --controller http://localhost:10000 --model-list-mode reload 109 | ``` 110 | You just launched the Gradio web interface. Now, you can open the web interface with the URL printed on the screen. You may notice that there is no model in the model list. Do not worry, as we have not launched any model worker yet. It will be automatically updated when you launch a model worker. 111 | 112 | #### Launch a model worker 113 | 114 | This is the actual *worker* that performs the inference on the GPU. Each worker is responsible for a single model specified in `--model-path`. 115 | 116 | Note that the 8B model supports single-card inference, but the 70B model requires 8-card inference. 117 | 118 | ```Shell 119 | CUDA_VISIBLE_DEVICES=0 python -m qh360_vl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path qihoo360/360VL-8B 120 | ``` 121 | 122 | ```Shell 123 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m qh360_vl.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path qihoo360/360VL-70B 124 | ``` 125 | 126 | ### CLI Inference 127 | 128 | Chat about images using 360VL without the need of Gradio interface. 129 | 130 | ```Shell 131 | INIT_MODEL_PATH="/hbox2dir" 132 | name="360VL-8B" 133 | python -m qh360_vl.eval.infer \ 134 | --model-path $INIT_MODEL_PATH/$name \ 135 | ``` 136 | 137 | 138 | ### Download Llama3 checkpoints (Non-essential) 139 | 140 | 360VL is developed based on Llama 3. If you have needs, please download the weights yourself. 141 | 142 | [[🤗Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)] 143 | [[🤗Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)] 144 | 145 | ## Evaluation 146 | We refer to the evaluation data organization method of LLava-1.5, which can be found in the following documents. 
[Evaluation.md](docs/Evaluation.md) 147 | 148 | ```Shell 149 | bash scripts/eval/mme.sh 150 | bash scripts/eval/mmb_cn.sh 151 | bash scripts/eval/mmb_en.sh 152 | bash scripts/eval/refcoco.sh 153 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ./scripts/eval/gqa.sh 154 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ./scripts/eval/vqav2.sh 155 | bash scripts/eval/llavabench.sh 156 | bash scripts/eval/mmmu.sh 157 | bash scripts/eval/pope.sh 158 | bash scripts/eval/textvqa.sh 159 | ``` 160 | 161 | 162 | 163 | ## License 164 | 165 | This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. 166 | The content of this project itself is licensed under the [Apache license 2.0](./LICENSE). 167 | 168 | ## Related Projects 169 | This work wouldn't be possible without the incredible open-source code of these projects. Huge thanks! 170 | - [Meta Llama 3](https://github.com/meta-llama/llama3) 171 | - [LLaVA: Large Language and Vision Assistant](https://github.com/haotian-liu/LLaVA) 172 | - [Honeybee: Locality-enhanced Projector for Multimodal LLM](https://github.com/kakaobrain/honeybee) 173 | 174 | 175 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | # pip config set global.index-url http://mirrors.cloud.tencent.com/pypi/simple 2 | # pip config set global.trusted-host mirrors.cloud.tencent.com 3 | pip install --upgrade pip # enable PEP 660 support 4 | 5 | pip install -e . 6 | 7 | pip install ninja 8 | pip install flash-attn --no-build-isolation 9 | -------------------------------------------------------------------------------- /docs/008.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360CVGroup/360VL/ad6a11c15d41cfea2fe487e0d2c88feb138546af/docs/008.jpg -------------------------------------------------------------------------------- /docs/Evaluation.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | 3 | In LLaVA-1.5, we evaluate models on a diverse set of 12 benchmarks. To ensure the reproducibility, we evaluate the models with greedy decoding. We do not evaluate using beam search to make the inference process consistent with the chat demo of real-time outputs. 4 | 5 | Currently, we mostly utilize the official toolkit or server for the evaluation. 6 | 7 | ## Evaluate on Custom Datasets 8 | 9 | You can evaluate LLaVA on your custom datasets by converting your dataset to LLaVA's jsonl format, and evaluate using [`model_vqa.py`](https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/model_vqa.py). 10 | 11 | Below we provide a general guideline for evaluating datasets with some common formats. 12 | 13 | 1. Short-answer (e.g. VQAv2, MME). 14 | 15 | ``` 16 | 17 | Answer the question using a single word or phrase. 18 | ``` 19 | 20 | 2. Option-only for multiple-choice (e.g. MMBench, SEED-Bench). 21 | 22 | ``` 23 | 24 | A. 25 | B. 26 | C. 27 | D. 28 | Answer with the option's letter from the given choices directly. 29 | ``` 30 | 31 | 3. Natural QA (e.g. LLaVA-Bench, MM-Vet). 32 | 33 | No postprocessing is needed. 34 | 35 | ## Scripts 36 | 37 | Before preparing task-specific data, **you MUST first download [eval.zip](https://drive.google.com/file/d/1atZSBBrAX54yYpxtVVW33zFvcnaHeFPy/view?usp=sharing)**. 
It contains custom annotations, scripts, and the prediction files with LLaVA v1.5. Extract to `./playground/data/eval`. This also provides a general structure for all datasets. 38 | 39 | ### VQAv2 40 | 41 | 1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and put it under `./playground/data/eval/vqav2`. 42 | 2. Multi-GPU inference. 43 | ```Shell 44 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/vqav2.sh 45 | ``` 46 | 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `./playground/data/eval/vqav2/answers_upload`. 47 | 48 | ### GQA 49 | 50 | 1. Download the [data](https://cs.stanford.edu/people/dorarad/gqa/download.html) and [evaluation scripts](https://cs.stanford.edu/people/dorarad/gqa/evaluate.html) following the official instructions and put under `./playground/data/eval/gqa/data`. You may need to modify `eval.py` as [this](https://gist.github.com/haotian-liu/db6eddc2a984b4cbcc8a7f26fd523187) due to the missing assets in the GQA v1.2 release. 51 | 2. Multi-GPU inference. 52 | ```Shell 53 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/gqa.sh 54 | ``` 55 | 56 | ### VisWiz 57 | 58 | 1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `./playground/data/eval/vizwiz`. 59 | 2. Single-GPU inference. 60 | ```Shell 61 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/vizwiz.sh 62 | ``` 63 | 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/2185/my-submission): `./playground/data/eval/vizwiz/answers_upload`. 64 | 65 | ### ScienceQA 66 | 67 | 1. Under `./playground/data/eval/scienceqa`, download `images`, `pid_splits.json`, `problems.json` from the `data/scienceqa` folder of the ScienceQA [repo](https://github.com/lupantech/ScienceQA). 68 | 2. Single-GPU inference and evaluate. 69 | ```Shell 70 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/sqa.sh 71 | ``` 72 | 73 | ### TextVQA 74 | 75 | 1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and [images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and extract to `./playground/data/eval/textvqa`. 76 | 2. Single-GPU inference and evaluate. 77 | ```Shell 78 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/textvqa.sh 79 | ``` 80 | 81 | ### POPE 82 | 83 | 1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put under `./playground/data/eval/pope`. 84 | 2. Single-GPU inference and evaluate. 85 | ```Shell 86 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/pope.sh 87 | ``` 88 | 89 | ### MME 90 | 91 | 1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation). 92 | 2. Downloaded images to `MME_Benchmark_release_version`. 93 | 3. put the official `eval_tool` and `MME_Benchmark_release_version` under `./playground/data/eval/MME`. 94 | 4. Single-GPU inference and evaluate. 95 | ```Shell 96 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mme.sh 97 | ``` 98 | 99 | ### MMBench 100 | 101 | 1. Download [`mmbench_dev_20230712.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put under `./playground/data/eval/mmbench`. 102 | 2. Single-GPU inference. 
103 | ```Shell 104 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench.sh 105 | ``` 106 | 3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `./playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712`. 107 | 108 | ### MMBench-CN 109 | 110 | 1. Download [`mmbench_dev_cn_20231003.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_cn_20231003.tsv) and put under `./playground/data/eval/mmbench`. 111 | 2. Single-GPU inference. 112 | ```Shell 113 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench_cn.sh 114 | ``` 115 | 3. Submit the results to the evaluation server: `./playground/data/eval/mmbench/answers_upload/mmbench_dev_cn_20231003`. 116 | 117 | 118 | ### SEED-Bench 119 | 120 | 1. Following the official [instructions](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md) to download the images and the videos. Put images under `./playground/data/eval/seed_bench/SEED-Bench-image`. 121 | 2. Extract the video frame in the middle from the downloaded videos, and put them under `./playground/data/eval/seed_bench/SEED-Bench-video-image`. We provide our script `extract_video_frames.py` modified from the official one. 122 | 3. Multiple-GPU inference and evaluate. 123 | ```Shell 124 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/seed.sh 125 | ``` 126 | 4. Optionally, submit the results to the leaderboard: `./playground/data/eval/seed_bench/answers_upload` using the official jupyter notebook. 127 | 128 | ### LLaVA-Bench-in-the-Wild 129 | 130 | 1. Extract contents of [`llava-bench-in-the-wild`](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) to `./playground/data/eval/llava-bench-in-the-wild`. 131 | 2. Single-GPU inference and evaluate. 132 | ```Shell 133 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/llavabench.sh 134 | ``` 135 | 136 | ### MM-Vet 137 | 138 | 1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `./playground/data/eval/mmvet`. 139 | 2. Single-GPU inference. 140 | ```Shell 141 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh 142 | ``` 143 | 3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official jupyter notebook. 144 | 145 | ## More Benchmarks 146 | 147 | Below are awesome benchmarks for multimodal understanding from the research community, that are not initially included in the LLaVA-1.5 release. 148 | 149 | ### Q-Bench 150 | 151 | 1. Download [`llvisionqa_dev.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/llvisionqa_dev.json) (for `dev`-subset) and [`llvisionqa_test.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/llvisionqa_test.json) (for `test`-subset). Put them under `./playground/data/eval/qbench`. 152 | 2. Download and extract [images](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/images_llvisionqa.tar) and put all the images directly under `./playground/data/eval/qbench/images_llviqionqa`. 153 | 3. Single-GPU inference (change `dev` to `test` for evaluation on test set). 154 | ```Shell 155 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/qbench.sh dev 156 | ``` 157 | 4. Submit the results by instruction [here](https://github.com/VQAssessment/Q-Bench#option-1-submit-results): `./playground/data/eval/qbench/llvisionqa_dev_answers.jsonl`. 158 | 159 | ### Chinese-Q-Bench 160 | 161 | 1. 
Download [`质衡-问答-验证集.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/%E8%B4%A8%E8%A1%A1-%E9%97%AE%E7%AD%94-%E9%AA%8C%E8%AF%81%E9%9B%86.json) (for `dev`-subset) and [`质衡-问答-测试集.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/%E8%B4%A8%E8%A1%A1-%E9%97%AE%E7%AD%94-%E6%B5%8B%E8%AF%95%E9%9B%86.json) (for `test`-subset). Put them under `./playground/data/eval/qbench`. 162 | 2. Download and extract [images](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/images_llvisionqa.tar) and put all the images directly under `./playground/data/eval/qbench/images_llviqionqa`. 163 | 3. Single-GPU inference (change `dev` to `test` for evaluation on test set). 164 | ```Shell 165 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/qbench_zh.sh dev 166 | ``` 167 | 4. Submit the results by instruction [here](https://github.com/VQAssessment/Q-Bench#option-1-submit-results): `./playground/data/eval/qbench/llvisionqa_zh_dev_answers.jsonl`. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "qh360_vl" 7 | version = "1.0.0" 8 | description = "Towards GPT-4 like large language and visual assistant." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "einops", "fastapi", "gradio==3.35.2", "markdown2[all]", "numpy", 17 | "requests", "sentencepiece", "tokenizers>=0.12.1", 18 | "uvicorn", "wandb", 19 | "shortuuid", "httpx==0.24.0", 20 | "deepspeed==0.9.5", 21 | "peft==0.4.0", 22 | "transformers==4.37.2", 23 | "accelerate==0.29.3", 24 | "bitsandbytes==0.41.0", 25 | "scikit-learn==1.2.2", 26 | "sentencepiece==0.1.99", 27 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 28 | "gradio_client==0.2.9" 29 | ] 30 | 31 | [project.urls] 32 | "Homepage" = "https://github.com/360CVGroup/360VL" 33 | "Bug Tracker" = "https://github.com/360CVGroup/360VL/issues" 34 | 35 | [tool.setuptools.packages.find] 36 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 37 | 38 | [tool.wheel] 39 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] -------------------------------------------------------------------------------- /qh360_vl/360vl.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360CVGroup/360VL/ad6a11c15d41cfea2fe487e0d2c88feb138546af/qh360_vl/360vl.PNG -------------------------------------------------------------------------------- /qh360_vl/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import QH360_VL_LlamaForCausalLM 2 | -------------------------------------------------------------------------------- /qh360_vl/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
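# Log output directory used by the serve components (controller / Gradio web server /
# model worker); "." keeps the logs in the current working directory.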
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /qh360_vl/eval/compute_precision.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | 4 | 5 | def ori_bbox(nor_bbox, img_size): 6 | w = img_size[0] 7 | h = img_size[1] 8 | if w>h: 9 | big = w 10 | border_w = 0 11 | border_h = (w-h)//2 12 | elif w= right_line or top_line >= bottom_line: 49 | return 0 50 | else: 51 | intersect = (right_line - left_line) * (bottom_line - top_line) 52 | return intersect / (sum_area - intersect) * 1.0 53 | 54 | 55 | gt_info = {} 56 | with open('./playground/data/eval/refcoco/REFCOCO_VAL_en_new.jsonl', "r") as f: 57 | for line in tqdm(f): 58 | info = json.loads(line) 59 | gt_info[info['sent_id']] = {'bbox': info['bbox'], 'height': info['height'], 'width': info['width']} 60 | 61 | 62 | import sys 63 | with open(sys.argv[1], "r") as f: 64 | iou_thresh = 0.5 65 | tp = 0 66 | fp = 0 67 | for line in tqdm(f): 68 | info = json.loads(line) 69 | idx = info['question_id'] 70 | pred = info['text'] 71 | try: 72 | gt = gt_info[idx] 73 | gt_bbox = gt['bbox'] 74 | # print('gt:',gt_bbox) 75 | 76 | pred_bboxs = pred.split('; ') 77 | num_bboxs = len(pred_bboxs) 78 | for i, pred_bbox in enumerate(pred_bboxs): 79 | pred_bbox = eval(pred_bbox) 80 | 81 | pred_bbox = ori_bbox(pred_bbox, [gt['width'], gt['height']]) 82 | # print('pred:',pred_bbox,'gt:',gt_bbox) 83 | 84 | iou = compute_iou(pred_bbox, gt_bbox) 85 | if iou >= iou_thresh: 86 | tp += 1 87 | break 88 | else: 89 | if i == num_bboxs - 1: 90 | fp += 1 91 | except: 92 | print(pred) 93 | fp += 1 94 | precision = tp / (tp + fp) 95 | print(f'==== REC RESULT: precision = {precision}, tp = {tp}, fp = {fp}') 96 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the 
answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | 86 | if isinstance(inst['caption'], list): 87 | cap_str = '\n'.join(inst['caption']) 88 | else: 89 | cap_str = inst['caption'] 90 | 91 | category = 'llava_bench_' + json.loads(ques_js)['category'] 92 | if category in rule_dict: 93 | rule = rule_dict[category] 94 | else: 95 | assert False, f"Visual QA category not found in rule file: {category}." 
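        # Assemble the GPT-4 review prompt: the image captions as [Context], the question,
        # the two candidate answers, and the grading instructions taken from the rule file.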
96 | prompt = rule['prompt'] 97 | role = rule['role'] 98 | content = (f'[Context]\n{cap_str}\n\n' 99 | f'[Question]\n{ques["text"]}\n\n' 100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 102 | f'[System]\n{prompt}\n\n') 103 | cur_js = { 104 | 'id': idx+1, 105 | 'question_id': ques['question_id'], 106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 108 | 'category': category 109 | } 110 | if idx >= len(cur_reviews): 111 | review = get_eval(content, args.max_tokens) 112 | scores = parse_score(review) 113 | cur_js['content'] = review 114 | cur_js['tuple'] = scores 115 | review_file.write(json.dumps(cur_js) + '\n') 116 | review_file.flush() 117 | else: 118 | print(f'Skipping {idx} as we already have it.') 119 | idx += 1 120 | print(idx) 121 | review_file.close() 122 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, 
ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | return f1 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument("--annotation-dir", type=str) 
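    # --annotation-dir is the folder of coco_pope_*.json label files iterated over below;
    # the --question-file and --result-file arguments that follow take the benchmark
    # questions and the model's jsonl predictions.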
68 | parser.add_argument("--question-file", type=str) 69 | parser.add_argument("--result-file", type=str) 70 | args = parser.parse_args() 71 | 72 | questions = [json.loads(line) for line in open(args.question_file)] 73 | questions = {question['question_id']: question for question in questions} 74 | answers = [json.loads(q) for q in open(args.result_file)] 75 | 76 | f1s = [] 77 | for file in os.listdir(args.annotation_dir): 78 | assert file.startswith('coco_pope_') 79 | assert file.endswith('.json') 80 | category = file[10:-5] 81 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 82 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 83 | f1 = eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 84 | f1s.append(f1) 85 | print("====================================") 86 | print("f1 mean:", sum(f1s) / len(f1s)) 87 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_science_qa_gpt4_requery.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--requery-result', type=str) 14 | parser.add_argument('--our-result', type=str) 15 | parser.add_argument('--output-result', type=str) 16 | parser.add_argument('--split', type=str, default='test') 17 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 18 | return parser.parse_args() 19 | 
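# convert_caps below reshapes prediction records into COCO-caption-style entries
# ({"image_id": ..., "caption": ...}); the same helper appears in the other ScienceQA
# evaluation scripts in this folder.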
20 | 21 | def convert_caps(results): 22 | fakecaps = [] 23 | for result in results: 24 | image_id = result['question_id'] 25 | caption = result['text'] 26 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 27 | return fakecaps 28 | 29 | 30 | def get_pred_idx(prediction, choices, options): 31 | """ 32 | Get the index (e.g. 2) from the prediction (e.g. 'C') 33 | """ 34 | if prediction in options[:len(choices)]: 35 | return options.index(prediction) 36 | else: 37 | return random.choice(range(len(choices))) 38 | 39 | 40 | if __name__ == "__main__": 41 | args = get_args() 42 | 43 | base_dir = args.base_dir 44 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 45 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 46 | our_predictions = [json.loads(line) for line in open(args.our_result)] 47 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 48 | split_problems = {idx: problems[idx] for idx in split_indices} 49 | 50 | requery_predictions = [json.loads(line) for line in open(args.requery_result)] 51 | requery_predictions = {pred['question_id']: pred for pred in requery_predictions} 52 | 53 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 54 | 55 | results = defaultdict(lambda: 0) 56 | 57 | sqa_results = {} 58 | sqa_results['acc'] = None 59 | sqa_results['correct'] = None 60 | sqa_results['count'] = None 61 | sqa_results['results'] = {} 62 | sqa_results['outputs'] = {} 63 | 64 | for prob_id, prob in split_problems.items(): 65 | if prob_id not in our_predictions: 66 | assert False 67 | if prob_id not in gpt4_predictions: 68 | assert False 69 | our_pred = our_predictions[prob_id]['text'] 70 | gpt4_pred = gpt4_predictions[prob_id] 71 | if prob_id not in requery_predictions: 72 | results['missing_requery'] += 1 73 | requery_pred = "MISSING" 74 | else: 75 | requery_pred = requery_predictions[prob_id]['text'] 76 | 77 | pattern = re.compile(r'The answer is ([A-Z]).') 78 | our_res = pattern.findall(our_pred) 79 | if len(our_res) == 1: 80 | our_answer = our_res[0] # 'A', 'B', ... 81 | else: 82 | our_answer = "FAILED" 83 | 84 | requery_res = pattern.findall(requery_pred) 85 | if len(requery_res) == 1: 86 | requery_answer = requery_res[0] # 'A', 'B', ... 87 | else: 88 | requery_answer = "FAILED" 89 | 90 | gpt4_res = pattern.findall(gpt4_pred) 91 | if len(gpt4_res) == 1: 92 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 
93 | else: 94 | gpt4_answer = "FAILED" 95 | 96 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 97 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 98 | requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options) 99 | 100 | results['total'] += 1 101 | 102 | if gpt4_answer == 'FAILED': 103 | results['gpt4_failed'] += 1 104 | if gpt4_pred_idx == prob['answer']: 105 | results['gpt4_correct'] += 1 106 | if our_pred_idx == prob['answer']: 107 | results['gpt4_ourvisual_correct'] += 1 108 | elif gpt4_pred_idx == prob['answer']: 109 | results['gpt4_correct'] += 1 110 | results['gpt4_ourvisual_correct'] += 1 111 | 112 | if our_pred_idx == prob['answer']: 113 | results['our_correct'] += 1 114 | 115 | if requery_answer == 'FAILED': 116 | sqa_results['results'][prob_id] = our_pred_idx 117 | if our_pred_idx == prob['answer']: 118 | results['requery_correct'] += 1 119 | else: 120 | sqa_results['results'][prob_id] = requery_pred_idx 121 | if requery_pred_idx == prob['answer']: 122 | results['requery_correct'] += 1 123 | else: 124 | print(f""" 125 | Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']} 126 | Our ({our_answer}): {our_pred} 127 | GPT-4 ({gpt4_answer}): {gpt4_pred} 128 | Requery ({requery_answer}): {requery_pred} 129 | print("=====================================") 130 | """) 131 | 132 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 133 | results['correct_upperbound'] += 1 134 | 135 | total = results['total'] 136 | print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {results["our_correct"] / total * 100:.2f}%') 137 | print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {results["gpt4_correct"] / total * 100:.2f}%') 138 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 139 | print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%') 140 | print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%') 141 | print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 142 | 143 | sqa_results['acc'] = results["requery_correct"] / total * 100 144 | sqa_results['correct'] = results["requery_correct"] 145 | sqa_results['count'] = total 146 | 147 | with open(args.output_result, 'w') as f: 148 | json.dump(sqa_results, f, indent=2) 149 | 150 | -------------------------------------------------------------------------------- /qh360_vl/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from qh360_vl.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | print(result['text'],'<======================>',annotation['answers']) 50 | 51 | evaluator = TextVQAAccuracyEvaluator() 52 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 53 | 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | 58 | if args.result_file is not None: 59 | eval_single(args.annotation_file, args.result_file) 60 | 61 | if args.result_dir is not None: 62 | for result_file in sorted(os.listdir(args.result_dir)): 63 | if not result_file.endswith('.jsonl'): 64 | print(f'Skipping {result_file}') 65 | continue 66 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 67 | -------------------------------------------------------------------------------- /qh360_vl/eval/infer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | import pdb 19 | import sys 20 | from pprint import pprint as pp 21 | 22 | g_input_msg = [ 23 | { 24 | "role": "system", 25 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
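        # g_input_msg holds the running chat history in Llama 3 chat format; get_input()
        # appends each user turn and infer_model() appends each assistant reply, so the
        # CLI keeps multi-round context.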
26 | } 27 | ] 28 | 29 | 30 | def get_input(tokenizer, image_processor, model_config, rounds, query, args): 31 | g_input_msg.append({ 32 | "role": "user", 33 | "content": ("<|reserved_special_token_44|>"+ '\n' if not rounds else "") + query 34 | }) 35 | 36 | input_ids = tokenizer.apply_chat_template( 37 | g_input_msg, 38 | add_generation_prompt=True, 39 | padding="longest", 40 | return_tensors="pt", 41 | ) 42 | input_id_list = input_ids[0].tolist() 43 | input_id_list[input_id_list.index(128049)]=-200 44 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 45 | 46 | image = Image.open(args.image_path).convert('RGB') 47 | if args.slide_window: 48 | image_tensor = process_images_slid_window(image, image_processor, model_config, None, None, 336) 49 | else: 50 | image_tensor = process_images([image], image_processor, model_config)[0] 51 | 52 | return input_ids.unsqueeze(0), image_tensor.unsqueeze(0) 53 | 54 | 55 | def infer_model(args): 56 | # Model 57 | disable_torch_init() 58 | model_path = os.path.expanduser(args.model_path) 59 | model_name = get_model_name_from_path(model_path) 60 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 61 | tokenizer.pad_token = tokenizer.eos_token 62 | 63 | rounds = 0 64 | while 1: 65 | try: 66 | query = input("user: ") 67 | if query == "exit": 68 | break 69 | except: 70 | continue 71 | 72 | input_ids, image_tensor = get_input(tokenizer, image_processor, model.config, rounds, query, args) 73 | input_ids = input_ids.to(device='cuda', non_blocking=True) 74 | 75 | with torch.inference_mode(): 76 | output_ids = model.generate( 77 | input_ids, 78 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 79 | do_sample=True if args.temperature > 0 else False, 80 | temperature=args.temperature, 81 | eos_token_id=[tokenizer.convert_tokens_to_ids("<|eot_id|>",)], 82 | top_p=args.top_p, 83 | num_beams=args.num_beams, 84 | max_new_tokens=128, 85 | use_cache=True) 86 | 87 | input_token_len = input_ids.shape[1] 88 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 89 | if n_diff_input_output > 0: 90 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 91 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 92 | outputs = outputs.strip() 93 | print("qh360_vl:", outputs) 94 | 95 | g_input_msg.append({ 96 | "role": "assistant", 97 | "content": outputs 98 | }) 99 | rounds += 1 100 | 101 | 102 | 103 | 104 | if __name__ == "__main__": 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument("--model-path", type=str, default=None) 107 | parser.add_argument("--image-path", type=str, default=None) 108 | parser.add_argument("--model-base", type=str, default=None) 109 | parser.add_argument("--temperature", type=float, default=0) 110 | parser.add_argument("--top_p", type=float, default=None) 111 | parser.add_argument("--num_beams", type=int, default=1) 112 | parser.add_argument("--slide_window", action="store_true") 113 | args = parser.parse_args() 114 | 115 | infer_model(args) -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, 
DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from qh360_vl.conversation import conv_templates, SeparatorStyle 10 | from qh360_vl.model.builder import load_pretrained_model 11 | from qh360_vl.utils import disable_torch_init 12 | from qh360_vl.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria,process_images_slid_window 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 35 | tokenizer.pad_token = tokenizer.eos_token 36 | 37 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 38 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 39 | answers_file = os.path.expanduser(args.answers_file) 40 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 41 | ans_file = open(answers_file, "w") 42 | for line in tqdm(questions): 43 | idx = line["question_id"] 44 | image_file = line["image"] 45 | qs = line["text"] 46 | cur_prompt = qs 47 | if model.config.mm_use_im_start_end: 48 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 49 | else: 50 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 51 | 52 | input_msg = [ 53 | { 54 | "role": "system", 55 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
56 | }, 57 | { 58 | "role": "user", 59 | "content": qs 60 | } 61 | ] 62 | input_ids = tokenizer.apply_chat_template( 63 | input_msg, 64 | add_generation_prompt=True, 65 | padding="longest", 66 | return_tensors="pt", 67 | ) 68 | input_id_list = input_ids[0].tolist() 69 | input_id_list[input_id_list.index(128049)]=-200 70 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device).unsqueeze(0).cuda() 71 | 72 | image = Image.open(os.path.join(args.image_folder, image_file)) 73 | 74 | if args.slide_window: 75 | image_tensor = process_images_slid_window(image, image_processor, model.config, None, None, 336) 76 | else: 77 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 78 | 79 | terminators = [ 80 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 81 | ] 82 | 83 | with torch.inference_mode(): 84 | output_ids = model.generate( 85 | input_ids, 86 | images=image_tensor.unsqueeze(0).half().cuda(), 87 | do_sample=True if args.temperature > 0 else False, 88 | temperature=args.temperature, 89 | eos_token_id=terminators, 90 | top_p=args.top_p, 91 | num_beams=args.num_beams, 92 | # no_repeat_ngram_size=3, 93 | max_new_tokens=1024, 94 | use_cache=True) 95 | 96 | input_token_len = input_ids.shape[1] 97 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 98 | if n_diff_input_output > 0: 99 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 100 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 101 | 102 | outputs = outputs.strip() 103 | 104 | ans_id = shortuuid.uuid() 105 | ans_file.write(json.dumps({"question_id": idx, 106 | "prompt": cur_prompt, 107 | "text": outputs, 108 | "answer_id": ans_id, 109 | "model_id": model_name, 110 | "metadata": {}}) + "\n") 111 | ans_file.flush() 112 | ans_file.close() 113 | 114 | if __name__ == "__main__": 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 117 | parser.add_argument("--model-base", type=str, default=None) 118 | parser.add_argument("--image-folder", type=str, default="") 119 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 120 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 121 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 122 | parser.add_argument("--num-chunks", type=int, default=1) 123 | parser.add_argument("--chunk-idx", type=int, default=0) 124 | parser.add_argument("--temperature", type=float, default=0.2) 125 | parser.add_argument("--top_p", type=float, default=None) 126 | parser.add_argument("--num_beams", type=int, default=1) 127 | parser.add_argument("--slide_window", action="store_true") 128 | args = parser.parse_args() 129 | 130 | eval_model(args) 131 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_loader_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils 
import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | 46 | input_msg = [ 47 | { 48 | "role": "system", 49 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 50 | }, 51 | { 52 | "role": "user", 53 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 54 | } 55 | ] 56 | input_ids = self.tokenizer.apply_chat_template( 57 | input_msg, 58 | add_generation_prompt=True, 59 | padding="longest", 60 | return_tensors="pt", 61 | ) 62 | 63 | input_id_list = input_ids[0].tolist() 64 | input_id_list[input_id_list.index(128049)]=-200 65 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 66 | 67 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 68 | 69 | if self.args.slide_window: 70 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 71 | else: 72 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 73 | 74 | return input_ids, image_tensor 75 | 76 | def __len__(self): 77 | return len(self.questions) 78 | 79 | 80 | # DataLoader 81 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 82 | assert batch_size == 1, "batch_size must be 1" 83 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 84 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 85 | return data_loader 86 | 87 | 88 | def eval_model(args): 89 | # Model 90 | torch.distributed.init_process_group( 91 | backend='nccl', 92 | world_size=int(os.getenv('WORLD_SIZE', '1')), 93 | rank=int(os.getenv('RANK', '0')), 94 | ) 95 | 96 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 97 | 98 | disable_torch_init() 99 | model_path = os.path.expanduser(args.model_path) 100 | model_name = get_model_name_from_path(model_path) 101 | tokenizer, model, 
image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 102 | tokenizer.pad_token = tokenizer.eos_token 103 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 104 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 105 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 106 | 107 | answers_file = os.path.expanduser(args.answers_file) 108 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 109 | ans_file = open(answers_file, "w") 110 | 111 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 112 | args.conv_mode = args.conv_mode + '_mmtag' 113 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 114 | 115 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 116 | 117 | all_outputs = [] 118 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 119 | idx = line["question_id"] 120 | cur_prompt = line["text"] 121 | 122 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 123 | input_ids = input_ids.to(device='cuda', non_blocking=True) 124 | terminators = [ 125 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 126 | ] 127 | 128 | with torch.inference_mode(): 129 | output_ids = model.generate( 130 | input_ids, 131 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 132 | do_sample=True if args.temperature > 0 else False, 133 | temperature=args.temperature, 134 | eos_token_id=terminators, 135 | top_p=args.top_p, 136 | num_beams=args.num_beams, 137 | max_new_tokens=1280, 138 | use_cache=True) 139 | 140 | input_token_len = input_ids.shape[1] 141 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 142 | if n_diff_input_output > 0: 143 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 144 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 145 | outputs = outputs.strip() 146 | print(outputs) 147 | 148 | ans_id = shortuuid.uuid() 149 | 150 | all_outputs.append({"question_id": idx, 151 | "prompt": cur_prompt, 152 | "text": outputs, 153 | "answer_id": ans_id, 154 | "model_id": model_name, 155 | "metadata": {}}) 156 | 157 | torch.distributed.barrier() 158 | 159 | world_size = torch.distributed.get_world_size() 160 | merged_outputs = [None for _ in range(world_size)] 161 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 162 | 163 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 164 | 165 | if torch.distributed.get_rank() == 0: 166 | for item in merged_outputs: 167 | ans_file.write(json.dumps(item) + "\n") 168 | ans_file.close() 169 | torch.distributed.barrier() 170 | 171 | if __name__ == "__main__": 172 | parser = argparse.ArgumentParser() 173 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 174 | parser.add_argument("--model-base", type=str, default=None) 175 | parser.add_argument("--image-folder", type=str, default="") 176 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 177 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 178 | parser.add_argument("--conv-mode", 
type=str, default="llava_v1") 179 | parser.add_argument("--num-chunks", type=int, default=1) 180 | parser.add_argument("--chunk-idx", type=int, default=0) 181 | parser.add_argument("--temperature", type=float, default=0.2) 182 | parser.add_argument("--top_p", type=float, default=None) 183 | parser.add_argument("--num_beams", type=int, default=1) 184 | parser.add_argument("--slide_window", action="store_true") 185 | args = parser.parse_args() 186 | 187 | eval_model(args) 188 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_loader_llama3_nodist.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | input_msg = [ 46 | { 47 | "role": "system", 48 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
49 | }, 50 | { 51 | "role": "user", 52 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 53 | } 54 | ] 55 | input_ids = self.tokenizer.apply_chat_template( 56 | input_msg, 57 | add_generation_prompt=True, 58 | padding="longest", 59 | return_tensors="pt", 60 | ) 61 | 62 | input_id_list = input_ids[0].tolist() 63 | input_id_list[input_id_list.index(128049)]=-200 64 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 65 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 66 | if self.args.slide_window: 67 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 68 | else: 69 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 70 | 71 | return input_ids, image_tensor 72 | 73 | 74 | def __len__(self): 75 | return len(self.questions) 76 | 77 | 78 | # DataLoader 79 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 80 | assert batch_size == 1, "batch_size must be 1" 81 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 82 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 83 | return data_loader 84 | 85 | 86 | def eval_model(args): 87 | # Model 88 | disable_torch_init() 89 | model_path = os.path.expanduser(args.model_path) 90 | model_name = get_model_name_from_path(model_path) 91 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 92 | tokenizer.pad_token = tokenizer.eos_token 93 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 94 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 95 | 96 | answers_file = os.path.expanduser(args.answers_file) 97 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 98 | ans_file = open(answers_file, "w") 99 | 100 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 101 | args.conv_mode = args.conv_mode + '_mmtag' 102 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 103 | 104 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 105 | 106 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 107 | idx = line["question_id"] 108 | cur_prompt = line["text"] 109 | 110 | input_ids = input_ids.to(device='cuda', non_blocking=True) 111 | terminators = [ 112 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 113 | ] 114 | 115 | with torch.inference_mode(): 116 | output_ids = model.generate( 117 | input_ids, 118 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 119 | do_sample=True if args.temperature > 0 else False, 120 | temperature=args.temperature, 121 | eos_token_id=terminators, 122 | top_p=args.top_p, 123 | num_beams=args.num_beams, 124 | max_new_tokens=1280, 125 | use_cache=True) 126 | 127 | input_token_len = input_ids.shape[1] 128 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 129 | if n_diff_input_output > 0: 130 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 131 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 132 | outputs = outputs.strip() 
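        # Descriptive comment (added for clarity): `outputs` now holds only the newly generated tokens; the first input_token_len prompt tokens were sliced off above before decoding.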
133 | print(outputs) 134 | 135 | ans_id = shortuuid.uuid() 136 | ans_file.write(json.dumps({"question_id": idx, 137 | "prompt": cur_prompt, 138 | "text": outputs, 139 | "answer_id": ans_id, 140 | "model_id": model_name, 141 | "metadata": {}}) + "\n") 142 | 143 | ans_file.close() 144 | 145 | 146 | if __name__ == "__main__": 147 | parser = argparse.ArgumentParser() 148 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 149 | parser.add_argument("--model-base", type=str, default=None) 150 | parser.add_argument("--image-folder", type=str, default="") 151 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 152 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 153 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 154 | parser.add_argument("--num-chunks", type=int, default=1) 155 | parser.add_argument("--chunk-idx", type=int, default=0) 156 | parser.add_argument("--temperature", type=float, default=0.2) 157 | parser.add_argument("--top_p", type=float, default=None) 158 | parser.add_argument("--num_beams", type=int, default=1) 159 | parser.add_argument("--slide_window", action="store_true") 160 | args = parser.parse_args() 161 | 162 | eval_model(args) 163 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_loader_raw.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | 46 | input_msg = [ 47 | { 48 | "role": "system", 49 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. 
If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 50 | }, 51 | { 52 | "role": "user", 53 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 54 | } 55 | ] 56 | input_ids = self.tokenizer.apply_chat_template( 57 | input_msg, 58 | add_generation_prompt=True, 59 | padding="longest", 60 | return_tensors="pt", 61 | ) 62 | 63 | input_id_list = input_ids[0].tolist() 64 | input_id_list[input_id_list.index(128049)]=-200 65 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 66 | 67 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 68 | 69 | if self.args.slide_window: 70 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 71 | else: 72 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 73 | 74 | return input_ids, image_tensor 75 | 76 | def __len__(self): 77 | return len(self.questions) 78 | 79 | 80 | # DataLoader 81 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 82 | assert batch_size == 1, "batch_size must be 1" 83 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 84 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 85 | return data_loader 86 | 87 | 88 | def eval_model(args): 89 | # Model 90 | disable_torch_init() 91 | model_path = os.path.expanduser(args.model_path) 92 | model_name = get_model_name_from_path(model_path) 93 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 94 | tokenizer.pad_token = tokenizer.eos_token 95 | 96 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 97 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 98 | answers_file = os.path.expanduser(args.answers_file) 99 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 100 | ans_file = open(answers_file, "w") 101 | 102 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: # False 103 | args.conv_mode = args.conv_mode + '_mmtag' 104 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 105 | 106 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 107 | 108 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 109 | idx = line["question_id"] 110 | cur_prompt = line["text"] 111 | 112 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 # '' 113 | input_ids = input_ids.to(device='cuda', non_blocking=True) 114 | terminators = [ 115 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 116 | ] 117 | 118 | with torch.inference_mode(): 119 | output_ids = model.generate( 120 | input_ids, 121 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 122 | do_sample=True if args.temperature > 0 else False, 123 | temperature=args.temperature, 124 | eos_token_id=terminators, 125 | top_p=args.top_p, 126 | num_beams=args.num_beams, 127 | max_new_tokens=128, 128 | use_cache=True) 129 | 130 | 
input_token_len = input_ids.shape[1] 131 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 132 | if n_diff_input_output > 0: 133 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 134 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 135 | outputs = outputs.strip() 136 | 137 | ans_id = shortuuid.uuid() 138 | ans_file.write(json.dumps({"question_id": idx, 139 | "prompt": cur_prompt, 140 | "text": outputs, 141 | "answer_id": ans_id, 142 | "model_id": model_name, 143 | "metadata": {}}, 144 | ensure_ascii=False) + "\n") 145 | ans_file.flush() 146 | ans_file.close() 147 | 148 | if __name__ == "__main__": 149 | parser = argparse.ArgumentParser() 150 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 151 | parser.add_argument("--model-base", type=str, default=None) 152 | parser.add_argument("--image-folder", type=str, default="") 153 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 154 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 155 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 156 | parser.add_argument("--num-chunks", type=int, default=1) 157 | parser.add_argument("--chunk-idx", type=int, default=0) 158 | parser.add_argument("--temperature", type=float, default=0.2) 159 | parser.add_argument("--top_p", type=float, default=None) 160 | parser.add_argument("--num_beams", type=int, default=1) 161 | parser.add_argument("--slide_window", action="store_true") 162 | args = parser.parse_args() 163 | 164 | eval_model(args) 165 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_mmbench_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | import pandas as pd 6 | from tqdm import tqdm 7 | import shortuuid 8 | import itertools 9 | 10 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 11 | from qh360_vl.conversation import conv_templates, SeparatorStyle 12 | from qh360_vl.model.builder import load_pretrained_model 13 | from qh360_vl.utils import disable_torch_init 14 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path,process_images_slid_window 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | all_options = ['A', 'B', 'C', 'D'] 21 | 22 | 23 | def split_list(lst, n): 24 | """Split a list into n (roughly) equal-sized chunks""" 25 | chunk_size = math.ceil(len(lst) / n) # integer division 26 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 27 | 28 | 29 | def get_chunk(lst, n, k): 30 | chunks = split_list(lst, n) 31 | return chunks[k] 32 | 33 | 34 | def is_none(value): 35 | if value is None: 36 | return True 37 | if type(value) is float and math.isnan(value): 38 | return True 39 | if type(value) is str and value.lower() == 'nan': 40 | return True 41 | if type(value) is str and value.lower() == 'none': 42 | return True 43 | return False 44 | 45 | def get_options(row, options): 46 | parsed_options = [] 47 | for option in options: 48 | option_value = row[option] 49 | if is_none(option_value): 50 | break 51 | parsed_options.append(option_value) 52 | return parsed_options 53 | 54 | 55 | def eval_model(args): 56 | # Model 57 | 58 | 
torch.distributed.init_process_group( 59 | backend='nccl', 60 | world_size=int(os.getenv('WORLD_SIZE', '1')), 61 | rank=int(os.getenv('RANK', '0')), 62 | ) 63 | 64 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 65 | 66 | disable_torch_init() 67 | model_path = os.path.expanduser(args.model_path) 68 | model_name = get_model_name_from_path(model_path) 69 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 70 | tokenizer.pad_token = tokenizer.eos_token 71 | 72 | questions = pd.read_table(os.path.expanduser(args.question_file)) 73 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 74 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 75 | 76 | answers_file = os.path.expanduser(args.answers_file) 77 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 78 | ans_file = open(answers_file, "w") 79 | 80 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 81 | args.conv_mode = args.conv_mode + '_mmtag' 82 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 83 | 84 | all_outputs = [] 85 | for index, row in tqdm(questions.iterrows(), total=len(questions)): 86 | options = get_options(row, all_options) 87 | cur_option_char = all_options[:len(options)] 88 | 89 | if args.all_rounds: 90 | num_rounds = len(options) 91 | else: 92 | num_rounds = 1 93 | 94 | for round_idx in range(num_rounds): 95 | idx = row['index'] 96 | question = row['question'] 97 | hint = row['hint'] 98 | image = load_image_from_base64(row['image']) 99 | if not is_none(hint): 100 | question = hint + '\n' + question 101 | for option_char, option in zip(all_options[:len(options)], options): 102 | question = question + '\n' + option_char + '. ' + option 103 | qs = cur_prompt = question 104 | 105 | qs = "<|reserved_special_token_44|>" + '\n' + qs 106 | 107 | if args.single_pred_prompt: 108 | if args.lang == 'cn': 109 | qs = qs + '\n' + "请直接回答选项字母。" 110 | else: 111 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 112 | input_msg = [ 113 | { 114 | "role": "system", 115 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
116 | }, 117 | { 118 | "role": "user", 119 | "content": qs 120 | } 121 | ] 122 | input_ids = tokenizer.apply_chat_template( 123 | input_msg, 124 | add_generation_prompt=True, 125 | padding="longest", 126 | return_tensors="pt", 127 | ) 128 | input_id_list = input_ids[0].tolist() 129 | input_id_list[input_id_list.index(128049)]=-200 130 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device).unsqueeze(0).cuda() 131 | if args.slide_window: 132 | image_tensor = process_images_slid_window(image, image_processor, model.config, None, None, 336) 133 | else: 134 | image_tensor = process_images([image], image_processor, model.config)[0] 135 | 136 | terminators = [ 137 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 138 | ] 139 | 140 | with torch.inference_mode(): 141 | output_ids = model.generate( 142 | input_ids, 143 | images=image_tensor.unsqueeze(0).half().cuda(), 144 | do_sample=True if args.temperature > 0 else False, 145 | temperature=args.temperature, 146 | eos_token_id=terminators, 147 | top_p=args.top_p, 148 | num_beams=args.num_beams, 149 | # no_repeat_ngram_size=3, 150 | max_new_tokens=1024, 151 | use_cache=True) 152 | 153 | input_token_len = input_ids.shape[1] 154 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 155 | if n_diff_input_output > 0: 156 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 157 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 158 | outputs = outputs.strip() 159 | print(outputs) 160 | 161 | ans_id = shortuuid.uuid() 162 | all_outputs.append({"question_id": idx, 163 | "round_id": round_idx, 164 | "prompt": cur_prompt, 165 | "text": outputs, 166 | "options": options, 167 | "option_char": cur_option_char, 168 | "answer_id": ans_id, 169 | "model_id": model_name, 170 | "metadata": {}}) 171 | # rotate options 172 | options = options[1:] + options[:1] 173 | cur_option_char = cur_option_char[1:] + cur_option_char[:1] 174 | 175 | torch.distributed.barrier() 176 | 177 | world_size = torch.distributed.get_world_size() 178 | merged_outputs = [None for _ in range(world_size)] 179 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 180 | 181 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 182 | 183 | if torch.distributed.get_rank() == 0: 184 | for item in merged_outputs: 185 | ans_file.write(json.dumps(item) + "\n") 186 | ans_file.close() 187 | torch.distributed.barrier() 188 | 189 | if __name__ == "__main__": 190 | parser = argparse.ArgumentParser() 191 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 192 | parser.add_argument("--model-base", type=str, default=None) 193 | parser.add_argument("--image-folder", type=str, default="") 194 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 195 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 196 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 197 | parser.add_argument("--num-chunks", type=int, default=1) 198 | parser.add_argument("--chunk-idx", type=int, default=0) 199 | parser.add_argument("--temperature", type=float, default=0.2) 200 | parser.add_argument("--top_p", type=float, default=None) 201 | parser.add_argument("--num_beams", type=int, default=1) 202 | parser.add_argument("--all-rounds", action="store_true") 203 | parser.add_argument("--single-pred-prompt", action="store_true") 204 | parser.add_argument("--lang", type=str, 
default="en") 205 | parser.add_argument("--slide_window", action="store_true") 206 | args = parser.parse_args() 207 | 208 | eval_model(args) 209 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_mme_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | qs = qs.replace("\nAnswer the question using a single word or phrase.", " Please answer yes or no.") # open compass 46 | 47 | input_msg = [ 48 | { 49 | "role": "system", 50 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
51 | }, 52 | { 53 | "role": "user", 54 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 55 | } 56 | ] 57 | input_ids = self.tokenizer.apply_chat_template( 58 | input_msg, 59 | add_generation_prompt=True, 60 | padding="longest", 61 | return_tensors="pt", 62 | ) 63 | 64 | input_id_list = input_ids[0].tolist() 65 | input_id_list[input_id_list.index(128049)]=-200 66 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 67 | 68 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 69 | 70 | if self.args.slide_window: 71 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 72 | else: 73 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 74 | 75 | return input_ids, image_tensor 76 | 77 | def __len__(self): 78 | return len(self.questions) 79 | 80 | 81 | # DataLoader 82 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 83 | assert batch_size == 1, "batch_size must be 1" 84 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 85 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 86 | return data_loader 87 | 88 | 89 | def eval_model(args): 90 | # Model 91 | torch.distributed.init_process_group( 92 | backend='nccl', 93 | world_size=int(os.getenv('WORLD_SIZE', '1')), 94 | rank=int(os.getenv('RANK', '0')), 95 | ) 96 | 97 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 98 | 99 | disable_torch_init() 100 | model_path = os.path.expanduser(args.model_path) 101 | model_name = get_model_name_from_path(model_path) 102 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 103 | tokenizer.pad_token = tokenizer.eos_token 104 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 105 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 106 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 107 | 108 | answers_file = os.path.expanduser(args.answers_file) 109 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 110 | ans_file = open(answers_file, "w") 111 | 112 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 113 | args.conv_mode = args.conv_mode + '_mmtag' 114 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 115 | 116 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 117 | 118 | all_outputs = [] 119 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 120 | idx = line["question_id"] 121 | cur_prompt = line["text"] 122 | 123 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 124 | input_ids = input_ids.to(device='cuda', non_blocking=True) 125 | terminators = [ 126 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 127 | ] 128 | 129 | with torch.inference_mode(): 130 | output_ids = model.generate( 131 | input_ids, 132 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 133 | do_sample=True if args.temperature > 0 else False, 134 | temperature=args.temperature, 
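                    # Descriptive comment (added for clarity): `terminators` holds the id of "<|eot_id|>", so generation stops at the end of the assistant turn.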
135 | eos_token_id=terminators, 136 | top_p=args.top_p, 137 | num_beams=args.num_beams, 138 | max_new_tokens=1280, 139 | use_cache=True) 140 | 141 | input_token_len = input_ids.shape[1] 142 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 143 | if n_diff_input_output > 0: 144 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 145 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 146 | outputs = outputs.strip() 147 | print(outputs) 148 | 149 | ans_id = shortuuid.uuid() 150 | 151 | all_outputs.append({"question_id": idx, 152 | "prompt": cur_prompt, 153 | "text": outputs, 154 | "answer_id": ans_id, 155 | "model_id": model_name, 156 | "metadata": {}}) 157 | 158 | torch.distributed.barrier() 159 | 160 | world_size = torch.distributed.get_world_size() 161 | merged_outputs = [None for _ in range(world_size)] 162 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 163 | 164 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 165 | 166 | if torch.distributed.get_rank() == 0: 167 | for item in merged_outputs: 168 | ans_file.write(json.dumps(item) + "\n") 169 | ans_file.close() 170 | torch.distributed.barrier() 171 | 172 | if __name__ == "__main__": 173 | parser = argparse.ArgumentParser() 174 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 175 | parser.add_argument("--model-base", type=str, default=None) 176 | parser.add_argument("--image-folder", type=str, default="") 177 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 178 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 179 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 180 | parser.add_argument("--num-chunks", type=int, default=1) 181 | parser.add_argument("--chunk-idx", type=int, default=0) 182 | parser.add_argument("--temperature", type=float, default=0.2) 183 | parser.add_argument("--top_p", type=float, default=None) 184 | parser.add_argument("--num_beams", type=int, default=1) 185 | parser.add_argument("--slide_window", action="store_true") 186 | args = parser.parse_args() 187 | 188 | eval_model(args) 189 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_pope_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def 
__init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | qs = qs.replace("\nAnswer the question using a single word or phrase.", " Please answer yes or no.") # open compass 46 | 47 | input_msg = [ 48 | { 49 | "role": "system", 50 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 51 | }, 52 | { 53 | "role": "user", 54 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 55 | } 56 | ] 57 | input_ids = self.tokenizer.apply_chat_template( 58 | input_msg, 59 | add_generation_prompt=True, 60 | padding="longest", 61 | return_tensors="pt", 62 | ) 63 | 64 | input_id_list = input_ids[0].tolist() 65 | input_id_list[input_id_list.index(128049)]=-200 66 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 67 | 68 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 69 | 70 | if self.args.slide_window: 71 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 72 | else: 73 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 74 | 75 | return input_ids, image_tensor 76 | 77 | def __len__(self): 78 | return len(self.questions) 79 | 80 | 81 | # DataLoader 82 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 83 | assert batch_size == 1, "batch_size must be 1" 84 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 85 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 86 | return data_loader 87 | 88 | 89 | def eval_model(args): 90 | # Model 91 | torch.distributed.init_process_group( 92 | backend='nccl', 93 | world_size=int(os.getenv('WORLD_SIZE', '1')), 94 | rank=int(os.getenv('RANK', '0')), 95 | ) 96 | 97 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 98 | 99 | disable_torch_init() 100 | model_path = os.path.expanduser(args.model_path) 101 | model_name = get_model_name_from_path(model_path) 102 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 103 | tokenizer.pad_token = tokenizer.eos_token 104 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 105 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 106 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 107 | 108 | answers_file = os.path.expanduser(args.answers_file) 109 | 
os.makedirs(os.path.dirname(answers_file), exist_ok=True) 110 | ans_file = open(answers_file, "w") 111 | 112 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 113 | args.conv_mode = args.conv_mode + '_mmtag' 114 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 115 | 116 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 117 | 118 | all_outputs = [] 119 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 120 | idx = line["question_id"] 121 | cur_prompt = line["text"] 122 | 123 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 124 | input_ids = input_ids.to(device='cuda', non_blocking=True) 125 | terminators = [ 126 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 127 | ] 128 | 129 | with torch.inference_mode(): 130 | output_ids = model.generate( 131 | input_ids, 132 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 133 | do_sample=True if args.temperature > 0 else False, 134 | temperature=args.temperature, 135 | eos_token_id=terminators, 136 | top_p=args.top_p, 137 | num_beams=args.num_beams, 138 | max_new_tokens=1280, 139 | use_cache=True) 140 | 141 | input_token_len = input_ids.shape[1] 142 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 143 | if n_diff_input_output > 0: 144 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 145 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 146 | outputs = outputs.strip() 147 | print(outputs) 148 | 149 | ans_id = shortuuid.uuid() 150 | 151 | all_outputs.append({"question_id": idx, 152 | "prompt": cur_prompt, 153 | "text": outputs, 154 | "answer_id": ans_id, 155 | "model_id": model_name, 156 | "metadata": {}}) 157 | 158 | torch.distributed.barrier() 159 | 160 | world_size = torch.distributed.get_world_size() 161 | merged_outputs = [None for _ in range(world_size)] 162 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 163 | 164 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 165 | 166 | if torch.distributed.get_rank() == 0: 167 | for item in merged_outputs: 168 | ans_file.write(json.dumps(item) + "\n") 169 | ans_file.close() 170 | torch.distributed.barrier() 171 | 172 | if __name__ == "__main__": 173 | parser = argparse.ArgumentParser() 174 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 175 | parser.add_argument("--model-base", type=str, default=None) 176 | parser.add_argument("--image-folder", type=str, default="") 177 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 178 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 179 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 180 | parser.add_argument("--num-chunks", type=int, default=1) 181 | parser.add_argument("--chunk-idx", type=int, default=0) 182 | parser.add_argument("--temperature", type=float, default=0.2) 183 | parser.add_argument("--top_p", type=float, default=None) 184 | parser.add_argument("--num_beams", type=int, default=1) 185 | parser.add_argument("--slide_window", action="store_true") 186 | args = parser.parse_args() 187 | 188 | eval_model(args) 189 | 
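The loaders above (llama3, nodist, raw, mmbench, mme, pope) all build their prompts the same way: the Llama-3 chat template emits the reserved placeholder "<|reserved_special_token_44|>" (token id 128049), which is then swapped for the IMAGE_TOKEN_INDEX sentinel (-200) so the multimodal forward pass knows where to splice in the projected image features. The snippet below is only an illustrative sketch of that substitution step; the helper name and the shortened system prompt are assumptions, while the token ids are taken from the scripts themselves.

```python
import torch

IMAGE_PLACEHOLDER = "<|reserved_special_token_44|>"   # reserved Llama-3 token used as the image slot
IMAGE_PLACEHOLDER_ID = 128049                          # its id in the Llama-3 vocabulary
IMAGE_TOKEN_INDEX = -200                               # sentinel consumed by the multimodal forward pass

def build_multimodal_input_ids(tokenizer, question):
    # Assemble the same system/user structure used by the eval loaders (system prompt shortened here).
    msgs = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": IMAGE_PLACEHOLDER + "\n" + question},
    ]
    ids = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, return_tensors="pt")
    id_list = ids[0].tolist()
    # Replace the placeholder token with the -200 sentinel, exactly as the loaders do.
    id_list[id_list.index(IMAGE_PLACEHOLDER_ID)] = IMAGE_TOKEN_INDEX
    return torch.tensor(id_list, dtype=ids.dtype).unsqueeze(0)
```

The distributed variants then shard the question list by rank, run generation locally, and merge the per-rank answer lists with torch.distributed.all_gather_object before rank 0 writes the final JSONL file.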
-------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_refcoco_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | import random 19 | 20 | 21 | def split_list(lst, n): 22 | """Split a list into n (roughly) equal-sized chunks""" 23 | chunk_size = math.ceil(len(lst) / n) # integer division 24 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 25 | 26 | 27 | def get_chunk(lst, n, k): 28 | chunks = split_list(lst, n) 29 | return chunks[k] 30 | 31 | 32 | # Custom dataset class 33 | class CustomDataset(Dataset): 34 | def __init__(self, questions, image_folder, is_all_img_resize_672, tokenizer, image_processor, model_config, args): 35 | self.questions = questions 36 | self.image_folder = image_folder 37 | self.tokenizer = tokenizer 38 | self.image_processor = image_processor 39 | self.model_config = model_config 40 | 41 | self.is_all_img_resize_672 = is_all_img_resize_672 42 | self.args = args 43 | 44 | 45 | def __getitem__(self, index): 46 | line = self.questions[index] 47 | image_file = line["img_path"] 48 | 49 | qs = 'Please provide the bounding box coordinate of the region this sentence describes: '+line["expression"] 50 | 51 | input_msg = [ 52 | { 53 | "role": "system", 54 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 
55 | }, 56 | { 57 | "role": "user", 58 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 59 | } 60 | ] 61 | input_ids = self.tokenizer.apply_chat_template( 62 | input_msg, 63 | add_generation_prompt=True, 64 | padding="longest", 65 | return_tensors="pt", 66 | ) 67 | # print(input_ids) 68 | input_id_list = input_ids[0].tolist() 69 | input_id_list[input_id_list.index(128049)]=-200 70 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 71 | 72 | 73 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 74 | 75 | if self.args.slide_window: 76 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 77 | else: 78 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 79 | 80 | return input_ids, image_tensor 81 | 82 | def __len__(self): 83 | return len(self.questions) 84 | 85 | 86 | 87 | # DataLoader 88 | def create_data_loader(questions, image_folder, is_all_img_resize_672, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=8): 89 | assert batch_size == 1, "batch_size must be 1" 90 | dataset = CustomDataset(questions, image_folder, is_all_img_resize_672, tokenizer, image_processor, model_config,args) 91 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=None,) 92 | return data_loader 93 | 94 | 95 | def eval_model(args): 96 | # Model 97 | 98 | 99 | torch.distributed.init_process_group( 100 | backend='nccl', 101 | world_size=int(os.getenv('WORLD_SIZE', '1')), 102 | rank=int(os.getenv('RANK', '0')), 103 | ) 104 | 105 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 106 | 107 | 108 | disable_torch_init() 109 | 110 | model_path = os.path.expanduser(args.model_path) 111 | model_name = get_model_name_from_path(model_path) 112 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 113 | tokenizer.pad_token = tokenizer.eos_token 114 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 115 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 116 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 117 | 118 | answers_file = os.path.expanduser(args.answers_file) 119 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 120 | ans_file = open(answers_file, "w") 121 | 122 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 123 | args.conv_mode = args.conv_mode + '_mmtag' 124 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 125 | 126 | data_loader = create_data_loader(questions, args.image_folder, args.is_all_img_resize_672, tokenizer, image_processor, model.config, args) 127 | 128 | 129 | all_outputs = [] 130 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 131 | 132 | idx = line["sent_id"] 133 | cur_prompt = line["expression"] 134 | 135 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 136 | input_ids = input_ids.to(device='cuda', non_blocking=True) 137 | terminators = [ 138 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 139 | ] 140 | 141 | with torch.inference_mode(): 142 | output_ids = model.generate( 143 | input_ids, 144 | 
images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 145 | do_sample=True if args.temperature > 0 else False, 146 | temperature=args.temperature, 147 | eos_token_id=terminators, 148 | top_p=args.top_p, 149 | num_beams=args.num_beams, 150 | max_new_tokens=128, 151 | use_cache=True) 152 | 153 | input_token_len = input_ids.shape[1] 154 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 155 | if n_diff_input_output > 0: 156 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 157 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 158 | outputs = outputs.strip() 159 | 160 | print('expression:', cur_prompt) 161 | print('output:', outputs) 162 | 163 | ans_id = shortuuid.uuid() 164 | all_outputs.append({"question_id": idx, 165 | "prompt": cur_prompt, 166 | "text": outputs, 167 | "answer_id": ans_id, 168 | "model_id": model_name, 169 | "metadata": {}}) 170 | 171 | torch.distributed.barrier() 172 | 173 | world_size = torch.distributed.get_world_size() 174 | merged_outputs = [None for _ in range(world_size)] 175 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 176 | 177 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 178 | 179 | if torch.distributed.get_rank() == 0: 180 | for item in merged_outputs: 181 | ans_file.write(json.dumps(item, ensure_ascii=False) + "\n") 182 | ans_file.close() 183 | torch.distributed.barrier() 184 | 185 | if __name__ == "__main__": 186 | parser = argparse.ArgumentParser() 187 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 188 | parser.add_argument("--model-base", type=str, default=None) 189 | parser.add_argument("--image-folder", type=str, default="") 190 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 191 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 192 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 193 | parser.add_argument("--num-chunks", type=int, default=1) 194 | parser.add_argument("--chunk-idx", type=int, default=0) 195 | parser.add_argument("--temperature", type=float, default=0.2) 196 | parser.add_argument("--patch_img_size", type=int, default=224) 197 | parser.add_argument("--top_p", type=float, default=None) 198 | parser.add_argument("--num_beams", type=int, default=1) 199 | parser.add_argument("--is_all_img_resize_672", type=bool, default=False) 200 | parser.add_argument("--slide_window", action="store_true") 201 | args = parser.parse_args() 202 | 203 | eval_model(args) 204 | -------------------------------------------------------------------------------- /qh360_vl/eval/model_vqa_textvqa_llama3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | import itertools 8 | 9 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 10 | from qh360_vl.conversation import conv_templates, SeparatorStyle 11 | from qh360_vl.model.builder import load_pretrained_model 12 | from qh360_vl.utils import disable_torch_init 13 | from qh360_vl.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path,process_images_slid_window 14 | from torch.utils.data import Dataset, DataLoader 15 | 16 | from PIL import Image 17 | import math 18 | 19 | 20 | def 
split_list(lst, n): 21 | """Split a list into n (roughly) equal-sized chunks""" 22 | chunk_size = math.ceil(len(lst) / n) # integer division 23 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 24 | 25 | 26 | def get_chunk(lst, n, k): 27 | chunks = split_list(lst, n) 28 | return chunks[k] 29 | 30 | 31 | # Custom dataset class 32 | class CustomDataset(Dataset): 33 | def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, args): 34 | self.questions = questions 35 | self.image_folder = image_folder 36 | self.tokenizer = tokenizer 37 | self.image_processor = image_processor 38 | self.model_config = model_config 39 | self.args = args 40 | 41 | def __getitem__(self, index): 42 | line = self.questions[index] 43 | image_file = line["image"] 44 | qs = line["text"] 45 | 46 | input_msg = [ 47 | { 48 | "role": "system", 49 | "content": "You are a multilingual, helpful, respectful and honest assistant who can respond in the same language, depending on the language of the question. Try to be as helpful as possible while still being safe. Your answer should not contain anything that is false, unhealthy, harmful, immoral, racist, sexist, toxic, dangerous, or illegal, and if the question relates to such content, please decline to answer. Make sure your answer is socially fair and positive. If a question doesn't make any sense, or is inconsistent with the facts, explain why instead of answering the wrong answer. If you don't know the answer to a question, don't share false information." 50 | }, 51 | { 52 | "role": "user", 53 | "content": "<|reserved_special_token_44|>"+ '\n' + qs 54 | } 55 | ] 56 | input_ids = self.tokenizer.apply_chat_template( 57 | input_msg, 58 | add_generation_prompt=True, 59 | padding="longest", 60 | return_tensors="pt", 61 | ) 62 | 63 | input_id_list = input_ids[0].tolist() 64 | input_id_list[input_id_list.index(128049)]=-200 65 | input_ids = torch.tensor(input_id_list, dtype=input_ids.dtype,device=input_ids.device) 66 | 67 | image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') 68 | 69 | if self.args.slide_window: 70 | image_tensor = process_images_slid_window(image, self.image_processor, self.model_config, None, None, 336) 71 | else: 72 | image_tensor = process_images([image], self.image_processor, self.model_config)[0] 73 | 74 | return input_ids, image_tensor 75 | 76 | def __len__(self): 77 | return len(self.questions) 78 | 79 | 80 | # DataLoader 81 | def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, args, batch_size=1, num_workers=4): 82 | assert batch_size == 1, "batch_size must be 1" 83 | dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config, args) 84 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False) 85 | return data_loader 86 | 87 | 88 | def eval_model(args): 89 | # Model 90 | torch.distributed.init_process_group( 91 | backend='nccl', 92 | world_size=int(os.getenv('WORLD_SIZE', '1')), 93 | rank=int(os.getenv('RANK', '0')), 94 | ) 95 | 96 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 97 | 98 | disable_torch_init() 99 | model_path = os.path.expanduser(args.model_path) 100 | model_name = get_model_name_from_path(model_path) 101 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 102 | tokenizer.pad_token = tokenizer.eos_token 103 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), 
"r")] 104 | # questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 105 | questions = get_chunk(questions, int(os.getenv('WORLD_SIZE', '1')), torch.distributed.get_rank()) 106 | 107 | answers_file = os.path.expanduser(args.answers_file) 108 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 109 | ans_file = open(answers_file, "w") 110 | 111 | if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: 112 | args.conv_mode = args.conv_mode + '_mmtag' 113 | print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') 114 | 115 | data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config, args) 116 | 117 | all_outputs = [] 118 | for (input_ids, image_tensor), line in tqdm(zip(data_loader, questions), total=len(questions)): 119 | idx = line["question_id"] 120 | cur_prompt = line["text"] 121 | 122 | # stop_str = conv_templates[args.conv_mode].sep if conv_templates[args.conv_mode].sep_style != SeparatorStyle.TWO else conv_templates[args.conv_mode].sep2 123 | input_ids = input_ids.to(device='cuda', non_blocking=True) 124 | terminators = [ 125 | tokenizer.convert_tokens_to_ids("<|eot_id|>",) 126 | ] 127 | 128 | with torch.inference_mode(): 129 | output_ids = model.generate( 130 | input_ids, 131 | images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True), 132 | do_sample=True if args.temperature > 0 else False, 133 | temperature=args.temperature, 134 | eos_token_id=terminators, 135 | top_p=args.top_p, 136 | num_beams=args.num_beams, 137 | max_new_tokens=1280, 138 | use_cache=True) 139 | 140 | input_token_len = input_ids.shape[1] 141 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 142 | if n_diff_input_output > 0: 143 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 144 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 145 | outputs = outputs.strip() 146 | print(outputs) 147 | 148 | ans_id = shortuuid.uuid() 149 | 150 | all_outputs.append({"question_id": idx, 151 | "prompt": cur_prompt, 152 | "text": outputs, 153 | "answer_id": ans_id, 154 | "model_id": model_name, 155 | "metadata": {}}) 156 | 157 | torch.distributed.barrier() 158 | 159 | world_size = torch.distributed.get_world_size() 160 | merged_outputs = [None for _ in range(world_size)] 161 | torch.distributed.all_gather_object(merged_outputs, all_outputs) 162 | 163 | merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)] 164 | 165 | if torch.distributed.get_rank() == 0: 166 | for item in merged_outputs: 167 | ans_file.write(json.dumps(item) + "\n") 168 | ans_file.close() 169 | torch.distributed.barrier() 170 | 171 | if __name__ == "__main__": 172 | parser = argparse.ArgumentParser() 173 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 174 | parser.add_argument("--model-base", type=str, default=None) 175 | parser.add_argument("--image-folder", type=str, default="") 176 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 177 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 178 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 179 | parser.add_argument("--num-chunks", type=int, default=1) 180 | parser.add_argument("--chunk-idx", type=int, default=0) 181 | parser.add_argument("--temperature", type=float, default=0.2) 182 | 
parser.add_argument("--top_p", type=float, default=None) 183 | parser.add_argument("--num_beams", type=int, default=1) 184 | parser.add_argument("--slide_window", action="store_true") 185 | args = parser.parse_args() 186 | 187 | eval_model(args) 188 | -------------------------------------------------------------------------------- /qh360_vl/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /qh360_vl/mm_utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from io import BytesIO 3 | import base64 4 | 5 | import torch 6 | from transformers import StoppingCriteria 7 | from qh360_vl.constants import IMAGE_TOKEN_INDEX 8 | 9 | 10 | def load_image_from_base64(image): 11 | return Image.open(BytesIO(base64.b64decode(image))) 12 | 13 | 14 | def expand2square(pil_img, background_color): 15 | width, height = pil_img.size 16 | if width == height: 17 | return pil_img 18 | elif width > height: 19 | result = Image.new(pil_img.mode, (width, width), background_color) 20 | result.paste(pil_img, (0, (width - height) // 2)) 21 | return result 22 | else: 23 | result = Image.new(pil_img.mode, 
(height, height), background_color) 24 | result.paste(pil_img, ((height - width) // 2, 0)) 25 | return result 26 | 27 | 28 | def process_images(images, image_processor, model_cfg): 29 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 30 | new_images = [] 31 | if image_aspect_ratio == 'pad': 32 | for image in images: 33 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 34 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 35 | new_images.append(image) 36 | else: 37 | return image_processor(images, return_tensors='pt')['pixel_values'] 38 | if all(x.shape == new_images[0].shape for x in new_images): 39 | new_images = torch.stack(new_images, dim=0) 40 | return new_images 41 | 42 | def get_proper_imgsize(pil_img, vit_is): 43 | max_w_h = vit_is * 2 44 | new_pil_img = pil_img.resize((max_w_h, max_w_h)) 45 | return new_pil_img 46 | 47 | def tensor_crop(tensor_array, left, upper, right, lower): 48 | # tensor_array: C * H * W 49 | return tensor_array[:, upper:lower, left:right] 50 | 51 | def image_slid_window(image, num_slid_window): 52 | # image: tensor, 3 * 336 * 336 or 3 * 672 * 672 53 | # image: tensor, 3 * 224 * 224 or 3 * 448 * 448 54 | if num_slid_window == 5: 55 | image_x2, image_x1 = image[0], image[1] 56 | vit_is = image_x1.shape[1] 57 | h, w = image_x2.shape[1],image_x2.shape[2] 58 | image0 = tensor_crop(image_x2, 0, 0, vit_is, vit_is) 59 | image1 = tensor_crop(image_x2, w-vit_is, 0, w, vit_is) 60 | image2 = tensor_crop(image_x2, 0, h-vit_is, vit_is, h) 61 | image3 = tensor_crop(image_x2, w-vit_is, h-vit_is, w, h) 62 | return torch.stack([image0, image1, image2, image3, image_x1]) 63 | else: 64 | return image 65 | 66 | def process_images_slid_window(image, image_processor, model_cfg, is_maintain_orig_img_token, is_all_img_resize_672, vit_is): 67 | vit_is = vit_is # vit_input_size, for simplicity 68 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 69 | 70 | num_slid_window = 5 71 | 72 | if image_aspect_ratio == 'pad': 73 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 74 | image = get_proper_imgsize(image, vit_is) 75 | image_x2 = image_processor.preprocess(image, return_tensors='pt', do_resize=False, do_center_crop=False)['pixel_values'][0] 76 | image_x1 = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 77 | image = [image_x2, image_x1] 78 | else: 79 | image = get_proper_imgsize(image, vit_is) 80 | image_x2 = image_processor.preprocess(image, return_tensors='pt', do_resize=False, do_center_crop=False)['pixel_values'][0] 81 | image_x1 = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 82 | image = [image_x2, image_x1] 83 | 84 | image = image_slid_window(image, num_slid_window) 85 | 86 | return image 87 | 88 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 89 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')] 90 | 91 | def insert_separator(X, sep): 92 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 93 | 94 | input_ids = [] 95 | offset = 0 96 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 97 | offset = 1 98 | input_ids.append(prompt_chunks[0][0]) 99 | 100 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 101 | input_ids.extend(x[offset:]) 102 | 103 | if return_tensors is not None: 104 | if
return_tensors == 'pt': 105 | return torch.tensor(input_ids, dtype=torch.long) 106 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 107 | return input_ids 108 | 109 | # def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 110 | # # prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 111 | # prompt_chunks = [] # compatible with transformers>=4.32.0 112 | # for chunk in prompt.split(''): 113 | # if len(chunk) > 0: 114 | # prompt_chunks.append(tokenizer(chunk).input_ids) 115 | # else: 116 | # prompt_chunks.append([tokenizer.bos_token_id]) 117 | 118 | # def insert_separator(X, sep): 119 | # return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 120 | 121 | # input_ids = [] 122 | # offset = 0 123 | # if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 124 | # offset = 1 125 | # input_ids.append(prompt_chunks[0][0]) 126 | 127 | # for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 128 | # input_ids.extend(x[offset:]) 129 | 130 | # if return_tensors is not None: 131 | # if return_tensors == 'pt': 132 | # return torch.tensor(input_ids, dtype=torch.long) 133 | # raise ValueError(f'Unsupported tensor type: {return_tensors}') 134 | # return input_ids 135 | 136 | def get_model_name_from_path(model_path): 137 | model_path = model_path.strip("/") 138 | model_paths = model_path.split("/") 139 | if model_paths[-1].startswith('checkpoint-'): 140 | return model_paths[-2] + "_" + model_paths[-1] 141 | else: 142 | return model_paths[-1] 143 | 144 | 145 | 146 | 147 | class KeywordsStoppingCriteria(StoppingCriteria): 148 | def __init__(self, keywords, tokenizer, input_ids): 149 | self.keywords = keywords 150 | self.keyword_ids = [] 151 | self.max_keyword_len = 0 152 | for keyword in keywords: 153 | cur_keyword_ids = tokenizer(keyword).input_ids 154 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 155 | cur_keyword_ids = cur_keyword_ids[1:] 156 | if len(cur_keyword_ids) > self.max_keyword_len: 157 | self.max_keyword_len = len(cur_keyword_ids) 158 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 159 | self.tokenizer = tokenizer 160 | self.start_len = input_ids.shape[1] 161 | 162 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 163 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO 164 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 165 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 166 | for keyword_id in self.keyword_ids: 167 | if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all(): 168 | return True 169 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 170 | for keyword in self.keywords: 171 | if keyword in outputs: 172 | return True 173 | return False -------------------------------------------------------------------------------- /qh360_vl/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.QH360_VL_llama import QH360_VL_LlamaForCausalLM, QH360_VLConfig 2 | -------------------------------------------------------------------------------- /qh360_vl/model/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, 
Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import os 17 | import warnings 18 | import shutil 19 | 20 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig 21 | import torch 22 | from qh360_vl.model import * 23 | from qh360_vl.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 24 | import transformers 25 | 26 | def load_pretrained_model(model_path, model_base, model_name, load_bf16=False, load_8bit=False, load_4bit=False, device_map="auto", device="cuda"): 27 | kwargs = {"device_map": device_map} 28 | 29 | if load_8bit: 30 | kwargs['load_in_8bit'] = True 31 | elif load_4bit: 32 | kwargs['load_in_4bit'] = True 33 | kwargs['quantization_config'] = BitsAndBytesConfig( 34 | load_in_4bit=True, 35 | bnb_4bit_compute_dtype=torch.float16, 36 | bnb_4bit_use_double_quant=True, 37 | bnb_4bit_quant_type='nf4' 38 | ) 39 | elif load_bf16: 40 | kwargs['torch_dtype'] = torch.bfloat16 41 | else: 42 | kwargs['torch_dtype'] = torch.float16 43 | 44 | 45 | if '360vl' in model_name.lower(): 46 | # Load qh_360vl model 47 | if 'lora' in model_name.lower() and model_base is None: 48 | warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. 
Detailed instruction: https://github.com/haotian-liu/qh_360vl#launch-a-model-worker-lora-weights-unmerged.') 49 | if 'lora' in model_name.lower() and model_base is not None: 50 | lora_cfg_pretrained = AutoConfig.from_pretrained(model_path) 51 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 52 | print('Loading qh_360vl from base model...') 53 | model = QH360_VL_LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) 54 | token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features 55 | if model.lm_head.weight.shape[0] != token_num: 56 | model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) 57 | model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) 58 | 59 | print('Loading additional qh_360vl weights...') 60 | if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): 61 | non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') 62 | else: 63 | # this is probably from HF Hub 64 | from huggingface_hub import hf_hub_download 65 | def load_from_hf(repo_id, filename, subfolder=None): 66 | cache_file = hf_hub_download( 67 | repo_id=repo_id, 68 | filename=filename, 69 | subfolder=subfolder) 70 | return torch.load(cache_file, map_location='cpu') 71 | non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') 72 | non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} 73 | if any(k.startswith('model.model.') for k in non_lora_trainables): 74 | non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} 75 | model.load_state_dict(non_lora_trainables, strict=False) 76 | 77 | from peft import PeftModel 78 | print('Loading LoRA weights...') 79 | model = PeftModel.from_pretrained(model, model_path) 80 | print('Merging LoRA weights...') 81 | model = model.merge_and_unload() 82 | print('Model is loaded...') 83 | elif model_base is not None: 84 | # this may be mm projector only 85 | print('Loading qh_360vl from base model...') 86 | if 'mpt' in model_name.lower(): 87 | if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')): 88 | shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py')) 89 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True) 90 | cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True) 91 | model = LlavaMPTForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) 92 | else: 93 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 94 | cfg_pretrained = AutoConfig.from_pretrained(model_path) 95 | model = QH360_VL_LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) 96 | 97 | mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') 98 | mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} 99 | model.load_state_dict(mm_projector_weights, strict=False) 100 | else: 101 | if 'mpt' in model_name.lower(): 102 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 103 | model = LlavaMPTForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 104 | else: 105 | tokenizer = 
AutoTokenizer.from_pretrained(model_path, use_fast=False) 106 | model = QH360_VL_LlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 107 | else: 108 | # Load language model 109 | if model_base is not None: 110 | # PEFT model 111 | from peft import PeftModel 112 | tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) 113 | model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto") 114 | print(f"Loading LoRA weights from {model_path}") 115 | model = PeftModel.from_pretrained(model, model_path) 116 | print(f"Merging weights") 117 | model = model.merge_and_unload() 118 | print('Convert to FP16...') 119 | model.to(torch.float16) 120 | else: 121 | use_fast = False 122 | if 'mpt' in model_name.lower(): 123 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 124 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) 125 | else: 126 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 127 | model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) 128 | 129 | image_processor = None 130 | 131 | if '360vl' in model_name.lower(): 132 | mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) 133 | mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) 134 | if mm_use_im_patch_token: 135 | tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) 136 | if mm_use_im_start_end: 137 | tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) 138 | model.resize_token_embeddings(len(tokenizer)) 139 | 140 | vision_tower = model.get_vision_tower() 141 | if not vision_tower.is_loaded: 142 | vision_tower.load_model() 143 | 144 | vision_tower.to(device=device, dtype=torch.float16) 145 | image_processor = vision_tower.image_processor 146 | 147 | if hasattr(model.config, "max_sequence_length"): 148 | context_len = model.config.max_sequence_length 149 | else: 150 | context_len = 2048 151 | 152 | return tokenizer, model, image_processor, context_len 153 | -------------------------------------------------------------------------------- /qh360_vl/model/language_model/QH360_VL_llama.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import List, Optional, Tuple, Union 17 | 18 | import torch 19 | import torch.nn as nn 20 | 21 | from torch.nn import CrossEntropyLoss 22 | 23 | from transformers import AutoConfig, AutoModelForCausalLM, \ 24 | LlamaConfig, LlamaModel, LlamaForCausalLM 25 | 26 | from transformers.modeling_outputs import CausalLMOutputWithPast 27 | 28 | from ..QH360_VL_arch_cc import QH360_VL_MetaModel, QH360_VL_MetaForCausalLM 29 | 30 | 31 | class QH360_VLConfig(LlamaConfig): 32 | model_type = "QH_360VL" 33 | 34 | 35 | class QH360_VL_LlamaModel(QH360_VL_MetaModel, LlamaModel): 36 | config_class = QH360_VLConfig 37 | 38 | def __init__(self, config: LlamaConfig): 39 | super(QH360_VL_LlamaModel, self).__init__(config) 40 | 41 | 42 | class QH360_VL_LlamaForCausalLM(LlamaForCausalLM, QH360_VL_MetaForCausalLM): 43 | config_class = QH360_VLConfig 44 | 45 | def __init__(self, config): 46 | super(LlamaForCausalLM, self).__init__(config) 47 | config._attn_implementation = "flash_attention_2" 48 | self.model = QH360_VL_LlamaModel(config) 49 | 50 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 51 | 52 | # Initialize weights and apply final processing 53 | self.post_init() 54 | 55 | def get_model(self): 56 | return self.model 57 | 58 | def forward( 59 | self, 60 | input_ids: torch.LongTensor = None, 61 | attention_mask: Optional[torch.Tensor] = None, 62 | past_key_values: Optional[List[torch.FloatTensor]] = None, 63 | inputs_embeds: Optional[torch.FloatTensor] = None, 64 | labels: Optional[torch.LongTensor] = None, 65 | use_cache: Optional[bool] = None, 66 | output_attentions: Optional[bool] = None, 67 | output_hidden_states: Optional[bool] = None, 68 | images: Optional[torch.FloatTensor] = None, 69 | return_dict: Optional[bool] = None, 70 | ) -> Union[Tuple, CausalLMOutputWithPast]: 71 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions 72 | output_hidden_states = ( 73 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 74 | ) 75 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 76 | 77 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 78 | 79 | # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) 80 | outputs = self.model( 81 | input_ids=input_ids, 82 | attention_mask=attention_mask, 83 | past_key_values=past_key_values, 84 | inputs_embeds=inputs_embeds, 85 | use_cache=use_cache, 86 | output_attentions=output_attentions, 87 | output_hidden_states=output_hidden_states, 88 | return_dict=return_dict 89 | ) 90 | 91 | hidden_states = outputs[0] 92 | logits = self.lm_head(hidden_states) 93 | 94 | loss = None 95 | if labels is not None: 96 | # Shift so that tokens < n predict n 97 | shift_logits = logits[..., :-1, :].contiguous() 98 | shift_labels = labels[..., 1:].contiguous() 99 | # Flatten the tokens 100 | loss_fct = CrossEntropyLoss() 101 | shift_logits = shift_logits.view(-1, self.config.vocab_size) 102 | shift_labels = shift_labels.view(-1) 103 | # Enable model/pipeline parallelism 104 | shift_labels = shift_labels.to(shift_logits.device) 105 | loss = loss_fct(shift_logits, shift_labels) 106 | 107 | if not return_dict: 108 | output = (logits,) + outputs[1:] 109 | return (loss,) + output if loss is not None else output 110 | 111 | return CausalLMOutputWithPast( 112 | loss=loss, 113 
| logits=logits, 114 | past_key_values=outputs.past_key_values, 115 | hidden_states=outputs.hidden_states, 116 | attentions=outputs.attentions, 117 | ) 118 | 119 | def prepare_inputs_for_generation( 120 | self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs 121 | ): 122 | if past_key_values: 123 | input_ids = input_ids[:, -1:] 124 | 125 | # if `inputs_embeds` are passed, we only want to use them in the 1st generation step 126 | if inputs_embeds is not None and past_key_values is None: 127 | model_inputs = {"inputs_embeds": inputs_embeds} 128 | else: 129 | model_inputs = {"input_ids": input_ids} 130 | 131 | model_inputs.update( 132 | { 133 | "past_key_values": past_key_values, 134 | "use_cache": kwargs.get("use_cache"), 135 | "attention_mask": attention_mask, 136 | "images": kwargs.get("images", None), 137 | } 138 | ) 139 | return model_inputs 140 | 141 | AutoConfig.register("QH_360VL", QH360_VLConfig) 142 | AutoModelForCausalLM.register(QH360_VLConfig, QH360_VL_LlamaForCausalLM) 143 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | 9 | 10 | # if 'clip' in vision_tower.lower(): 11 | # return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 12 | 13 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 14 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 15 | 16 | raise ValueError(f'Unknown vision tower: {vision_tower}') 17 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | else: 20 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 21 | 22 | def load_model(self): 23 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 24 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 25 | self.vision_tower.requires_grad_(False) 26 | 27 | self.is_loaded = True 28 | 29 | def feature_select(self, image_forward_outs): 30 | image_features = image_forward_outs.hidden_states[self.select_layer] 31 | if self.select_feature == 'patch': 32 | image_features = image_features[:, 1:] 33 | elif self.select_feature == 'cls_patch': 34 | image_features = image_features 35 | else: 36 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 37 | return image_features 38 | 39 | @torch.no_grad() 40 | def forward(self, images): 41 | if type(images) is list: 42 | image_features = 
[] 43 | for image in images: 44 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 45 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 46 | image_features.append(image_feature) 47 | else: 48 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 49 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 50 | 51 | return image_features 52 | 53 | @property 54 | def dummy_feature(self): 55 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 56 | 57 | @property 58 | def dtype(self): 59 | return self.vision_tower.dtype 60 | 61 | @property 62 | def device(self): 63 | return self.vision_tower.device 64 | 65 | @property 66 | def config(self): 67 | if self.is_loaded: 68 | return self.vision_tower.config 69 | else: 70 | return self.cfg_only 71 | 72 | @property 73 | def hidden_size(self): 74 | return self.config.hidden_size 75 | 76 | @property 77 | def num_patches(self): 78 | return (self.config.image_size // self.config.patch_size) ** 2 79 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | from .projectors import CAbstractor 5 | from transformers import PretrainedConfig 6 | from .configuration_honeybee import HoneybeeConfig,HoneybeeVisualProjectorConfig 7 | import torch.nn.functional as F 8 | 9 | class IdentityMap(nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | @property 17 | def config(self): 18 | return {"mm_projector_type": 'identity'} 19 | 20 | 21 | class SimpleResBlock(nn.Module): 22 | def __init__(self, channels): 23 | super().__init__() 24 | self.pre_norm = nn.LayerNorm(channels) 25 | 26 | self.proj = nn.Sequential( 27 | nn.Linear(channels, channels), 28 | nn.GELU(), 29 | nn.Linear(channels, channels) 30 | ) 31 | def forward(self, x): 32 | x = self.pre_norm(x) 33 | return x + self.proj(x) 34 | 35 | 36 | def build_honeybee_projector(config, projector_type, num_tokens,lm_hidden_size): 37 | """Build projector (abstractor) and query_tokens (optionally for resampler)""" 38 | proj_config = config 39 | proj_type = projector_type 40 | num_tokens = num_tokens 41 | output_hidden_size = lm_hidden_size # LM hidden size 42 | 43 | abstractor = { 44 | "c-abs": CAbstractor, 45 | }[ 46 | proj_type 47 | ](proj_config, num_tokens, output_hidden_size) 48 | return abstractor 49 | 50 | 51 | def build_vision_projector(config, delay_load=False, **kwargs): 52 | projector_type = getattr(config, 'mm_projector_type', 'linear') 53 | 54 | if projector_type == 'linear': 55 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 56 | 57 | if projector_type == 'c-abs': 58 | 59 | local_config_path = config.mm_projector_config 60 | honeybee_config = HoneybeeVisualProjectorConfig.from_pretrained(local_config_path) 61 | 62 | num_tokens = config.mm_num_tokens 63 | 64 | lm_hidden_size = config.hidden_size 65 | 66 | abstractor = build_honeybee_projector(honeybee_config,projector_type,num_tokens,lm_hidden_size) 67 | return abstractor 68 | 69 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 70 | if mlp_gelu_match: 71 | mlp_depth = int(mlp_gelu_match.group(1)) 72 | modules = 
[nn.Linear(config.mm_hidden_size, config.hidden_size)] 73 | for _ in range(1, mlp_depth): 74 | modules.append(nn.GELU()) 75 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 76 | return nn.Sequential(*modules) 77 | 78 | if projector_type == 'identity': 79 | return IdentityMap() 80 | 81 | raise ValueError(f'Unknown projector type: {projector_type}') 82 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/pipeline/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import yaml 4 | from omegaconf import DictConfig, OmegaConf 5 | 6 | 7 | class AttrDict(dict): 8 | __setattr__ = dict.__setitem__ 9 | 10 | def __getattribute__(self, item): 11 | if item in self: 12 | return self[item] 13 | else: 14 | return super().__getattribute__(item) 15 | 16 | @classmethod 17 | def from_nested_dicts(cls, data): 18 | if not isinstance(data, dict): 19 | return data 20 | else: 21 | return cls({key: cls.from_nested_dicts(data[key]) for key in data}) 22 | 23 | def asdict(self): 24 | def _asdict(data): 25 | if not isinstance(data, dict): 26 | return data 27 | else: 28 | return {key: _asdict(data[key]) for key in data} 29 | 30 | return _asdict(self) 31 | 32 | 33 | def save_config(cfg): 34 | """Save config to `config.output_dir/exp_config.yaml`. 35 | """ 36 | output_dir = cfg.output_dir 37 | if isinstance(cfg, AttrDict): 38 | cfg = cfg.asdict() # AttrDict does not work with OmegaConf.to_yaml 39 | 40 | out_conf_dir = os.path.dirname(output_dir) 41 | os.makedirs(out_conf_dir, exist_ok=True) 42 | with open(os.path.join(output_dir, "exp_config.yaml"), "w") as fout: 43 | fout.write(OmegaConf.to_yaml(cfg) + "\n") 44 | 45 | 46 | def set_config(cfg: DictConfig, save: bool = False) -> AttrDict: 47 | # convert DictConfig to AttrDict 48 | # - it is slow to access DictConfig 49 | # - DictConfig makes an unresolved error: 50 | # `RuntimeError: DataLoader worker (pid 7103) is killed by signal: Aborted`. 51 | 52 | OmegaConf.resolve(cfg) 53 | 54 | if save: 55 | # config loaded by hydra is saved to /exp_config.yaml 56 | save_config(cfg) 57 | 58 | cfg = OmegaConf.to_container(cfg) 59 | cfg = AttrDict.from_nested_dicts(cfg) 60 | return cfg 61 | 62 | 63 | def load_config(cfg_path: str) -> AttrDict: 64 | with open(cfg_path, "r") as fin: 65 | cfg = yaml.load(fin, Loader=yaml.FullLoader) 66 | cfg = AttrDict.from_nested_dicts(cfg) 67 | return cfg 68 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/pipeline/data_utils/special_tokens.py: -------------------------------------------------------------------------------- 1 | SYSTEM = "The following is a conversation between a curious human and AI assistant." 2 | SYSTEM_DETAIL = "The assistant gives helpful, detailed, and polite answers to the user's questions." 3 | ORG_SYSTEM = SYSTEM + " " + SYSTEM_DETAIL 4 | IMAGE = "<image>" 5 | _MEDIA_TOKENS = {"image": [IMAGE]} # Special tokens used in this codebase.
6 | # Role pattern tokens 7 | HUMAN = "Human: " 8 | AI = "AI: " 9 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/pipeline/data_utils/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def idx2option(idx: int, style="upper", deco="dot"): 4 | """ 5 | idx: [0, N-1] 6 | style: upper, lower, num 7 | deco: None, paren, dot, rparen 8 | """ 9 | idx = { 10 | "upper": chr(ord("A") + idx), 11 | "lower": chr(ord("a") + idx), 12 | "num": f"{idx + 1}", 13 | }[style] 14 | 15 | idx = { 16 | None: "{idx}", 17 | "paren": "({idx})", 18 | "dot": "{idx}.", 19 | "rparen": "{idx})", 20 | }[deco].format(idx=idx) 21 | 22 | return idx 23 | 24 | 25 | def optionize( 26 | options: list[str], 27 | answer_idx: int, 28 | shuffle=False, 29 | aug_idx_style=False, 30 | include_answer_str=False, 31 | sep="\n" 32 | ) -> (str, str): 33 | """Convert options (list of str) to option string. 34 | This process also includes: 35 | - option shuffling 36 | - index augmentation 37 | Args: 38 | options (list[str]) 39 | answer_idx (int) 40 | shuffle (bool): shuffle options 41 | aug_idx_style (bool): randomly choose index style 42 | Aug examples: (1) / 1. / (A) / A. 43 | include_answer_str (bool): include answer string 44 | False: A 45 | True: A. {answer} 46 | Return: 47 | (option_str, answer_str) 48 | """ 49 | if isinstance(options, str): 50 | # already optionized 51 | return options 52 | 53 | answer = options[answer_idx] 54 | if shuffle: 55 | random.shuffle(options) 56 | answer_idx = options.index(answer) 57 | 58 | if not aug_idx_style: 59 | style = "upper" 60 | deco = "dot" 61 | else: 62 | style = random.choice(["upper", "lower", "num"]) 63 | deco = random.choice(["paren", "dot", "rparen"]) 64 | 65 | indices = [idx2option(i, style=style, deco=deco) for i in range(len(options))] 66 | answer_str = idx2option(answer_idx, style=style, deco=None) 67 | if include_answer_str: 68 | answer_str = f"{answer_str}. {answer}" 69 | 70 | options_with_index = [ 71 | f"{idx} {option}" 72 | for idx, option in zip(indices, options) 73 | ] 74 | option_str = sep.join(options_with_index) 75 | return option_str, answer_str 76 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/pipeline/interface.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from peft import PeftConfig, PeftModel 5 | from PIL import Image 6 | from transformers import AutoTokenizer 7 | from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD 8 | from pathlib import Path 9 | 10 | from honeybee.modeling_honeybee import HoneybeeForConditionalGeneration 11 | from honeybee.processing_honeybee import HoneybeeImageProcessor, HoneybeeProcessor 12 | 13 | 14 | def load_model(pretrained_ckpt, use_bf16=True, load_in_8bit=False): 15 | """Model loader. 16 | 17 | Args: 18 | pretrained_ckpt (string): The path to pre-trained checkpoint. 19 | use_bf16 (bool, optional): Whether to use bfloat16 to load the model. (Default: True) 20 | load_in_8bit(bool, optional): Flag to load model in 8it. (Default: False) 21 | 22 | Returns: 23 | model: Honeybee Model 24 | """ 25 | 26 | # we check whether the model is trained using PEFT 27 | # by checking existance of 'adapter_config.json' is in pretrained_ckpt folder. 
28 | is_peft = os.path.exists(os.path.join(pretrained_ckpt, "adapter_config.json")) 29 | 30 | if is_peft: 31 | # when using checkpoints trained using PEFT (by us) 32 | config = PeftConfig.from_pretrained(pretrained_ckpt) 33 | if config.base_model_name_or_path == "": 34 | # when pre-training, there is no definition of base_model_name_or_path 35 | # but, we saved the base model at /base 36 | config.base_model_name_or_path = os.path.join(os.path.dirname(pretrained_ckpt), "base") 37 | 38 | base_model = HoneybeeForConditionalGeneration.from_pretrained( 39 | config.base_model_name_or_path, 40 | load_in_8bit=load_in_8bit, 41 | torch_dtype=torch.bfloat16 if use_bf16 else torch.half, 42 | # avoiding RuntimeError: Expected all tensors to be on the same device 43 | device_map={"": int(os.environ.get("LOCAL_RANK", 0))}, 44 | ) 45 | model = PeftModel.from_pretrained( 46 | base_model, 47 | pretrained_ckpt, 48 | is_trainable=True, 49 | torch_dtype=torch.bfloat16 if use_bf16 else torch.half, 50 | ) 51 | else: 52 | # when using original mllm checkpoints 53 | model = HoneybeeForConditionalGeneration.from_pretrained( 54 | pretrained_ckpt, 55 | torch_dtype=torch.bfloat16 if use_bf16 else torch.half, 56 | ) 57 | return model 58 | 59 | 60 | def get_model(pretrained_ckpt, use_bf16=True, load_in_8bit=False): 61 | """Model Provider with tokenizer and processor. 62 | 63 | Args: 64 | pretrained_ckpt (string): The path to pre-trained checkpoint. 65 | use_bf16 (bool, optional): Whether to use bfloat16 to load the model. (Default: True) 66 | load_in_8bit(bool, optional): Flag to load model in 8it. (Default: False) 67 | 68 | Returns: 69 | model: Honeybee Model 70 | tokenizer: Honeybee (Llama) text tokenizer 71 | processor: Honeybee processor (including text and image) 72 | """ 73 | # Load model where base_ckpt is different when the target model is trained by PEFT 74 | model = load_model(pretrained_ckpt, use_bf16, load_in_8bit) 75 | 76 | image_size = model.config.vision_config.image_size 77 | num_query_tokens = model.config.num_query_tokens 78 | num_eos_tokens = getattr(model.config.visual_projector_config, "num_eos_tokens", 1) 79 | num_visual_tokens = num_query_tokens + num_eos_tokens 80 | 81 | # Build processor 82 | image_processor = HoneybeeImageProcessor( 83 | size=image_size, 84 | crop_size=image_size, 85 | image_mean=OPENAI_CLIP_MEAN, 86 | image_std=OPENAI_CLIP_STD, 87 | ) 88 | # Load tokenizer (LlamaTokenizer) 89 | tokenizer_ckpt = model.config.lm_config.pretrained_tokenizer_name_or_path 90 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_ckpt, use_fast=False) 91 | if tokenizer.pad_token is None: 92 | tokenizer.pad_token = tokenizer.unk_token 93 | processor = HoneybeeProcessor( 94 | image_processor, tokenizer, num_visual_token=num_visual_tokens 95 | ) 96 | 97 | return model, tokenizer, processor 98 | 99 | 100 | def do_generate( 101 | prompts, image_list, model, tokenizer, processor, use_bf16=False, **generate_kwargs 102 | ): 103 | """The interface for generation 104 | 105 | Args: 106 | prompts (List[str]): The prompt text 107 | image_list (List[str]): Paths of images 108 | model (HoneybeeForConditionalGeneration): HoneybeeForConditionalGeneration 109 | tokenizer (AutoTokenizer): AutoTokenizer 110 | processor (HoneybeeProcessor): HoneybeeProcessor 111 | use_bf16 (bool, optional): Whether to use bfloat16. Defaults to False. 112 | 113 | Returns: 114 | sentence (str): Generated sentence. 
115 | """ 116 | if image_list: 117 | images = [Image.open(_) for _ in image_list] 118 | else: 119 | images = None 120 | inputs = processor(text=prompts, images=images) 121 | inputs = {k: v.bfloat16() if v.dtype == torch.float else v for k, v in inputs.items()} 122 | inputs = {k: v.to(model.device) for k, v in inputs.items()} 123 | with torch.no_grad(): 124 | res = model.generate(**inputs, **generate_kwargs) 125 | sentence = tokenizer.decode(res.tolist()[0], skip_special_tokens=True) 126 | return sentence 127 | -------------------------------------------------------------------------------- /qh360_vl/model/multimodal_projector/projectors.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | import torch.nn as nn 5 | from einops import rearrange 6 | from timm.models.layers import LayerNorm, LayerNorm2d 7 | from timm.models.regnet import RegStage 8 | 9 | 10 | from .configuration_honeybee import HoneybeeVisualProjectorConfig 11 | from torch.nn import functional as F 12 | import math 13 | 14 | def build_pos_embeds( 15 | config: HoneybeeVisualProjectorConfig, num_input_tokens: int, vision_hidden_size: int 16 | ): 17 | # pos emb 18 | # true 19 | if config.pos_emb: 20 | pos_emb = torch.nn.Parameter(torch.zeros(1, num_input_tokens, vision_hidden_size)) 21 | nn.init.trunc_normal_(pos_emb, mean=0.0, std=0.02) 22 | else: 23 | pos_emb = None 24 | 25 | return pos_emb 26 | 27 | 28 | def build_eos_tokens(config: HoneybeeVisualProjectorConfig, output_hidden_size: int): 29 | # think tokens 30 | num_eos_tokens = config.num_eos_tokens 31 | # 0 32 | if num_eos_tokens: 33 | eos_tokens = torch.nn.Parameter(torch.randn(1, num_eos_tokens, output_hidden_size)) 34 | nn.init.trunc_normal_(eos_tokens, mean=0.0, std=config.initializer_range) 35 | else: 36 | eos_tokens = None 37 | 38 | return eos_tokens 39 | 40 | 41 | def build_prenorm(config: HoneybeeVisualProjectorConfig): 42 | # false 43 | if config.prenorm: 44 | prenorm = LayerNorm(config.encoder_hidden_size) 45 | else: 46 | prenorm = None 47 | return prenorm 48 | 49 | 50 | def build_mlp(depth, hidden_size, output_hidden_size): 51 | layers = [nn.Linear(hidden_size, output_hidden_size)] 52 | for _ in range(1, depth): 53 | layers.append(nn.SiLU()) 54 | layers.append(nn.Linear(output_hidden_size, output_hidden_size)) 55 | return nn.Sequential(*layers) 56 | 57 | def get_abs_pos(abs_pos, tgt_size): 58 | # abs_pos: L, C 59 | # tgt_size: M 60 | # return: M, C 61 | # 16,24 62 | src_size = int(math.sqrt(abs_pos.size(1))) 63 | # 32,48 64 | tgt_size = int(math.sqrt(tgt_size)) 65 | dtype = abs_pos.dtype 66 | 67 | if src_size != tgt_size: 68 | return F.interpolate( 69 | abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), 70 | size=(tgt_size, tgt_size), 71 | mode="bicubic", 72 | align_corners=False, 73 | ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) 74 | else: 75 | return abs_pos 76 | 77 | 78 | class Projector(nn.Module): 79 | """Base projector class""" 80 | 81 | def __init__( 82 | self, 83 | config: HoneybeeVisualProjectorConfig, 84 | num_input_tokens: int, 85 | output_hidden_size: int, 86 | ): 87 | super().__init__() 88 | self.config = config 89 | self.num_input_tokens = num_input_tokens 90 | self.output_hidden_size = output_hidden_size 91 | 92 | # think tokens 93 | self.eos_tokens = build_eos_tokens(config, output_hidden_size) 94 | 95 | # pos emb 96 | self.pos_emb = build_pos_embeds(config, num_input_tokens, config.encoder_hidden_size) 97 | 98 | self.prenorm = 
build_prenorm(config) 99 | 100 | self.build_net() 101 | 102 | def build_net(self): 103 | raise NotImplementedError() 104 | 105 | def _forward(self, x): 106 | raise NotImplementedError() 107 | 108 | def forward(self, x: torch.Tensor) -> torch.Tensor: 109 | """ 110 | Args: 111 | x: (B, L, encoder_hidden_size) tensor from the visual backbone (CLIP visual encoder), including cls token. 112 | """ 113 | if self.prenorm is not None: 114 | x = self.prenorm(x) 115 | 116 | if self.pos_emb is not None: 117 | # self.pos_emb = self.pos_emb[:,1:] 118 | pos_emb = get_abs_pos(self.pos_emb[:,1:], x.size(1)) 119 | pos_emb = pos_emb.to(device=x.device) 120 | x += pos_emb 121 | 122 | x = self._forward(x) # (B, L, output_hidden_size) 123 | 124 | B = x.size(0) 125 | if self.eos_tokens is not None: 126 | x = torch.cat([x, self.eos_tokens.expand(B, -1, -1)], dim=1) 127 | return x 128 | 129 | 130 | class ConvProjector(Projector): 131 | def _forward(self, x): 132 | # x: [B, L, dim] 133 | # x = x[:, 1:] # drop cls token and 2d forward 134 | 135 | hw = int(x.size(1) ** 0.5) 136 | x = rearrange(x, "b (h w) d -> b d h w", h=hw, w=hw) 137 | x = self.net(x) 138 | x = rearrange(x, "b d h w -> b (h w) d") 139 | x = self.readout(x) 140 | 141 | return x 142 | 143 | 144 | class CAbstractor(ConvProjector): 145 | """C-Abstractor""" 146 | def build_net(self): 147 | encoder_hidden_size = self.config.encoder_hidden_size 148 | hidden_size = self.config.hidden_size 149 | output_hidden_size = self.output_hidden_size 150 | depth = self.config.depth 151 | mlp_depth = self.config.mlp_depth 152 | 153 | n_queries = self.config.num_queries 154 | assert (n_queries ** 0.5).is_integer(), "n_queries must be square number" 155 | hw = int(n_queries ** 0.5) 156 | 157 | # RegBlock = ResBlock + SE 158 | RegBlock = partial( 159 | RegStage, 160 | stride=1, 161 | dilation=1, 162 | act_layer=nn.SiLU, 163 | norm_layer=LayerNorm2d, 164 | ) 165 | 166 | s1 = RegBlock( 167 | depth, 168 | encoder_hidden_size, 169 | hidden_size, 170 | ) 171 | sampler = nn.AdaptiveAvgPool2d((hw, hw)) 172 | s2 = RegBlock( 173 | depth, 174 | hidden_size, 175 | hidden_size, 176 | ) 177 | 178 | self.net = nn.Sequential(s1, sampler, s2) 179 | self.readout = build_mlp(mlp_depth, hidden_size, output_hidden_size) 180 | 181 | -------------------------------------------------------------------------------- /qh360_vl/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /qh360_vl/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360CVGroup/360VL/ad6a11c15d41cfea2fe487e0d2c88feb138546af/qh360_vl/serve/__init__.py -------------------------------------------------------------------------------- /qh360_vl/serve/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from qh360_vl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 5 | from qh360_vl.conversation import conv_templates, SeparatorStyle 6 | from qh360_vl.model.builder import load_pretrained_model 7 | from qh360_vl.utils import disable_torch_init 8 | from qh360_vl.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 9 | 10 | from PIL import Image 11 | 12 | import requests 13 | from PIL import Image 14 | from io import BytesIO 15 | from transformers import TextStreamer 16 | 17 | 18 | def load_image(image_file): 19 | if image_file.startswith('http://') or image_file.startswith('https://'): 20 | response = requests.get(image_file) 21 | image = Image.open(BytesIO(response.content)).convert('RGB') 22 | else: 23 | image = Image.open(image_file).convert('RGB') 24 | return image 25 | 26 | 27 | def main(args): 28 | # Model 29 | disable_torch_init() 30 | 31 | model_name = get_model_name_from_path(args.model_path) 32 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device) 33 | 34 | if 'llama-2' in model_name.lower(): 35 | conv_mode = "llava_llama_2" 36 | elif "v1" in model_name.lower(): 37 | conv_mode = "llava_v1" 38 | elif "mpt" in model_name.lower(): 39 | conv_mode = "mpt" 40 | else: 41 | conv_mode = "llava_v0" 42 | 43 | if args.conv_mode is not None and conv_mode != args.conv_mode: 44 | print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode)) 45 | else: 46 | args.conv_mode = conv_mode 47 | 48 | conv = conv_templates[args.conv_mode].copy() 49 | if "mpt" in model_name.lower(): 50 | roles = ('user', 'assistant') 51 | else: 52 | roles = conv.roles 53 | 54 | image = load_image(args.image_file) 55 | # Similar operation in model_worker.py 56 | image_tensor = process_images([image], image_processor, args) 57 | if type(image_tensor) is list: 58 | image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor] 59 | else: 60 | image_tensor = image_tensor.to(model.device, dtype=torch.float16) 61 | 62 | while True: 63 | try: 64 | inp = input(f"{roles[0]}: ") 65 | except EOFError: 66 | inp = "" 67 | if not inp: 68 | print("exit...") 69 | break 70 | 71 | print(f"{roles[1]}: ", end="") 72 | 73 | if image is not None: 74 | # first message 75 | if model.config.mm_use_im_start_end: 76 | inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp 77 | else: 78 | inp = 
DEFAULT_IMAGE_TOKEN + '\n' + inp 79 | conv.append_message(conv.roles[0], inp) 80 | image = None 81 | else: 82 | # later messages 83 | conv.append_message(conv.roles[0], inp) 84 | conv.append_message(conv.roles[1], None) 85 | prompt = conv.get_prompt() 86 | 87 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 88 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 89 | keywords = [stop_str] 90 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 91 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) 92 | 93 | with torch.inference_mode(): 94 | output_ids = model.generate( 95 | input_ids, 96 | images=image_tensor, 97 | do_sample=True, 98 | temperature=args.temperature, 99 | max_new_tokens=args.max_new_tokens, 100 | streamer=streamer, 101 | use_cache=True, 102 | stopping_criteria=[stopping_criteria]) 103 | 104 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() 105 | conv.messages[-1][-1] = outputs 106 | 107 | if args.debug: 108 | print("\n", {"prompt": prompt, "outputs": outputs}, "\n") 109 | 110 | 111 | if __name__ == "__main__": 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 114 | parser.add_argument("--model-base", type=str, default=None) 115 | parser.add_argument("--image-file", type=str, required=True) 116 | parser.add_argument("--device", type=str, default="cuda") 117 | parser.add_argument("--conv-mode", type=str, default=None) 118 | parser.add_argument("--temperature", type=float, default=0.2) 119 | parser.add_argument("--max-new-tokens", type=int, default=512) 120 | parser.add_argument("--load-8bit", action="store_true") 121 | parser.add_argument("--load-4bit", action="store_true") 122 | parser.add_argument("--debug", action="store_true") 123 | parser.add_argument("--image-aspect-ratio", type=str, default='pad') 124 | args = parser.parse_args() 125 | main(args) 126 | -------------------------------------------------------------------------------- /qh360_vl/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360CVGroup/360VL/ad6a11c15d41cfea2fe487e0d2c88feb138546af/qh360_vl/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /qh360_vl/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360CVGroup/360VL/ad6a11c15d41cfea2fe487e0d2c88feb138546af/qh360_vl/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /qh360_vl/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /qh360_vl/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from qh360_vl.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /qh360_vl/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from qh360_vl.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
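# Module-level handler shared by build_logger() below: it is created once as a
# TimedRotatingFileHandler (daily rotation, UTC) under LOGDIR and then attached
# to every registered logger, so all loggers in the process write to one log file.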
13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True) 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 
105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_seed_for_submission.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | eval_single(args.result_file, eval_only_type='video') 63 | 64 | with open(args.result_upload_file, 'w') as fp: 65 | for question in data['questions']: 66 | qid = question['question_id'] 67 | if qid in results: 68 | result = results[qid] 69 | else: 70 | result = results[int(qid)] 71 | fp.write(json.dumps({ 72 | 'question_id': qid, 73 | 'prediction': result['text'] 74 | }) + '\n') 75 | -------------------------------------------------------------------------------- /scripts/convert_sqa_to_llava.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot( 13 | problems, split_indices, prompt_format, 14 | use_caption=False, is_test=False) 15 | 16 | target_format = [] 17 | for prob_id, (input, output) in split_problems.items(): 18 | if input.startswith('Question: '): 19 | input = input.replace('Question: ', '') 20 | if output.startswith('Answer: '): 21 | output = output.replace('Answer: ', '') 22 | 23 | raw_prob_data 
= problems[prob_id] 24 | if raw_prob_data['image'] is None: 25 | target_format.append({ 26 | "id": prob_id, 27 | "conversations": [ 28 | {'from': 'human', 'value': f"{input}"}, 29 | {'from': 'gpt', 'value': f"{output}"}, 30 | ], 31 | }) 32 | 33 | else: 34 | target_format.append({ 35 | "id": prob_id, 36 | "image": os.path.join(prob_id, raw_prob_data['image']), 37 | "conversations": [ 38 | {'from': 'human', 'value': f"{input}\n"}, 39 | {'from': 'gpt', 'value': f"{output}"}, 40 | ], 41 | }) 42 | 43 | print(f'Number of samples: {len(target_format)}') 44 | 45 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 46 | json.dump(target_format, f, indent=2) 47 | 48 | 49 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 50 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 51 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 52 | 53 | split_problems = build_prompt_chatbot( 54 | problems, split_indices, prompt_format, 55 | use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith('Question: '): 60 | input = input.replace('Question: ', '') 61 | if output.startswith('Answer: '): 62 | output = output.replace('Answer: ', '') 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data['image'] is None: 66 | data = { 67 | "id": prob_id, 68 | "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data['image']), 76 | "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + '\n') 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 
'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, open(dst, 'w')) 57 | -------------------------------------------------------------------------------- /scripts/eval/custom_vqa.sh: -------------------------------------------------------------------------------- 1 | INIT_MODEL_PATH="/hbox2dir" 2 | 3 | name="qh360_vl-llama3-70B" 4 | 5 | python -m qh360_vl.eval.model_vqa_loader_llama3_nodist \ 6 | --model-path $INIT_MODEL_PATH/$name \ 7 | --question-file custom/vqa_test_custom.jsonl \ 8 | --image-folder custom/vqa \ 9 | --answers-file custom/$name.jsonl \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --conv-mode llama3 13 | -------------------------------------------------------------------------------- /scripts/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="qh360_vl-8B" 9 | SPLIT="llava_gqa_testdev_balanced" 10 | GQADIR="./playground/data/eval/gqa/data" 11 | INIT_MODEL_PATH="/hbox2dir" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m qh360_vl.eval.model_vqa_loader_raw \ 15 | --model-path $INIT_MODEL_PATH/$CKPT \ 16 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 17 | --image-folder ./playground/data/eval/gqa/data/images \ 18 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --slide_window \ 23 | --conv-mode llama3 & 24 | done 
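# Each backgrounded job above scores one shard of the GQA questions on its own GPU
# (the shard is selected via --num-chunks/--chunk-idx); after `wait`, the per-shard
# answer files are concatenated into merge.jsonl and converted for the official GQA evaluator.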
25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 39 | 40 | cd $GQADIR 41 | python eval/eval.py --tier testdev_balanced -------------------------------------------------------------------------------- /scripts/eval/infer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | python -m qh360_vl.eval.infer \ 6 | --model-path $INIT_MODEL_PATH/$CKPT \ 7 | --image-path /hbox2dir/test.jpg \ 8 | --slide_window 9 | -------------------------------------------------------------------------------- /scripts/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | 4 | name="qh360_vl-llama3-70B" 5 | python -m qh360_vl.eval.model_vqa \ 6 | --model-path $INIT_MODEL_PATH/$name \ 7 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 8 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 9 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/$name.jsonl \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --conv-mode llama3 13 | 14 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 15 | 16 | python qh360_vl/eval/eval_gpt_review_bench.py \ 17 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 18 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 19 | --rule llava/eval/table/rule.json \ 20 | --answer-list \ 21 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 22 | playground/data/eval/llava-bench-in-the-wild/answers/$name.jsonl \ 23 | --output \ 24 | playground/data/eval/llava-bench-in-the-wild/reviews/$name.jsonl 25 | 26 | python qh360_vl/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/$name.jsonl -------------------------------------------------------------------------------- /scripts/eval/mmb_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | for SPLIT in {"mmbench_dev_cn_20231003","mmbench_test_cn_20231003",} 6 | do 7 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_mmbench_llama3 \ 8 | --model-path $INIT_MODEL_PATH/$CKPT \ 9 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 10 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 11 | --single-pred-prompt \ 12 | --temperature 0 \ 13 | --slide_window \ 14 | --lang cn \ 15 | --conv-mode llama3 \ 16 | 17 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 18 | 19 | python scripts/convert_mmbench_for_submission.py \ 20 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 21 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 22 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 23 | --experiment $CKPT 24 | done -------------------------------------------------------------------------------- /scripts/eval/mmb_en.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | for SPLIT in {"mmbench_dev_en_20231003","mmbench_test_en_20231003",} 6 | do 7 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_mmbench_llama3 \ 8 | --model-path $INIT_MODEL_PATH/$CKPT \ 9 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 10 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 11 | --single-pred-prompt \ 12 | --temperature 0 \ 13 | --slide_window \ 14 | --lang en \ 15 | --conv-mode llama3 \ 16 | 17 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 18 | 19 | python scripts/convert_mmbench_for_submission.py \ 20 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 21 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 22 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 23 | --experiment $CKPT 24 | done -------------------------------------------------------------------------------- /scripts/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_mme_llama3 \ 6 | --model-path $INIT_MODEL_PATH/$CKPT \ 7 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 8 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 9 | --answers-file ./playground/data/eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --conv-mode llama3 13 | 14 | cd ./playground/data/eval/MME 15 | python convert_answer_to_mme.py --experiment $CKPT 16 | 17 | cd eval_tool 18 | python calculation.py --results_dir answers/$CKPT -------------------------------------------------------------------------------- /scripts/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | for SPLIT in {"validation","test",} 6 | do 7 | python -m qh360_vl.eval.model_vqa_mmmu \ 8 | --model-path $INIT_MODEL_PATH/$CKPT \ 9 | --data-path ./playground/data/eval/mmmu/MMMU \ 10 | --config-path ./playground/data/eval/mmmu/config.yaml \ 11 | --output-path ./playground/data/eval/mmmu/answers_upload/$SPLIT/$CKPT.json \ 12 | --split $SPLIT \ 13 | --slide_window \ 14 | --conv-mode llama3 15 | 16 | if [[ $SPLIT == "validation" ]] 17 | then 18 | python ./playground/data/eval/mmmu/eval_mmmu.py \ 19 | --output-path ./playground/data/eval/mmmu/answers_upload/$SPLIT/$CKPT.json 20 | fi 21 | done -------------------------------------------------------------------------------- /scripts/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_pope_llama3 \ 6 | --model-path $INIT_MODEL_PATH/$CKPT \ 7 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 8 | --image-folder ./playground/data/eval/pope/val2014 \ 9 | --answers-file ./playground/data/eval/pope/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --conv-mode llama3 13 | 14 | python qh360_vl/eval/eval_pope.py \ 15 | --annotation-dir ./playground/data/eval/pope/coco \ 16 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 17 | --result-file 
./playground/data/eval/pope/answers/$CKPT.jsonl -------------------------------------------------------------------------------- /scripts/eval/refcoco.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_refcoco_llama3 \ 6 | --model-path $INIT_MODEL_PATH/$CKPT \ 7 | --question-file ./playground/data/eval/refcoco/REFCOCO_VAL_en_new.jsonl \ 8 | --image-folder ./playground/data/eval/refcoco/train2014 \ 9 | --answers-file ./playground/data/eval/res_test/$CKPT/refcoco.json \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --patch_img_size 336 \ 13 | --conv-mode llama3 \ 14 | 15 | python ./qh360_vl/eval/compute_precision.py ./playground/data/eval/res_test/$CKPT/refcoco.json -------------------------------------------------------------------------------- /scripts/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INIT_MODEL_PATH="/hbox2dir" 3 | CKPT="qh360_vl-8B" 4 | 5 | torchrun --nproc_per_node 8 -m qh360_vl.eval.model_vqa_textvqa_llama3 \ 6 | --model-path $INIT_MODEL_PATH/$CKPT \ 7 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 8 | --image-folder ./playground/data/eval/textvqa/train_images \ 9 | --answers-file ./playground/data/eval/textvqa/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --slide_window \ 12 | --conv-mode llama3 13 | 14 | python -m qh360_vl.eval.eval_textvqa \ 15 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 16 | --result-file ./playground/data/eval/textvqa/answers/$CKPT.jsonl -------------------------------------------------------------------------------- /scripts/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="qh360_vl-8B" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | INIT_MODEL_PATH="/hbox2dir" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m qh360_vl.eval.model_vqa_loader_raw \ 14 | --model-path $INIT_MODEL_PATH/$CKPT \ 15 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/vqav2/test2015 \ 17 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --slide_window \ 22 | --conv-mode llama3 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT -------------------------------------------------------------------------------- /scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 
4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | --------------------------------------------------------------------------------
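A minimal invocation sketch for scripts/extract_mm_projector.py above; the checkpoint and output paths are placeholders, not defaults shipped with the repo. The script writes a torch-loadable state dict containing only the weights whose keys match 'mm_projector'.

```Shell
python scripts/extract_mm_projector.py \
    --model-path /path/to/360VL-8B \
    --output /path/to/mm_projector.bin
```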