├── .gitignore ├── LICENSE ├── README.md ├── assets ├── data.png ├── demo_all.gif ├── demo_box.gif ├── demo_point.gif ├── framework.png ├── osprey.png ├── performance.png ├── qmsht.gif ├── qyqx.gif ├── table1.png ├── table2.png ├── table3.png ├── table4.png ├── table5.png ├── table6.png └── video_cover.png ├── dataset.md ├── demo ├── app.py ├── inference.py └── osprey_inference.py ├── osprey ├── __init__.py ├── configs │ ├── stage2.json │ └── stage3.json ├── constants.py ├── conversation.py ├── data_generation │ ├── ask_gpt.py │ ├── concise_qa │ │ ├── ask_example.txt │ │ ├── res_example.txt │ │ └── system_message.txt │ ├── conversation │ │ ├── ask_example.txt │ │ ├── res_example.txt │ │ └── system_message.txt │ ├── data_generation_pipeline.sh │ ├── description │ │ ├── ask_example.txt │ │ ├── res_example.txt │ │ └── system_message.txt │ ├── generate_gpt_prompt.py │ └── gpt_data_generation.py ├── datasets │ ├── data_modules.py │ ├── osprey_724k.py │ ├── stage2_data.py │ ├── vcr.py │ └── vg.py ├── eval │ ├── README.md │ ├── datasets │ │ ├── README.md │ │ ├── ade20k_instance_catid_mapping.txt │ │ ├── prepare_ade20k_ins_seg.py │ │ ├── prepare_ade20k_pan_seg.py │ │ └── prepare_ade20k_sem_seg.py │ ├── description │ │ ├── answers.json │ │ ├── prompt.json │ │ └── questions.json │ ├── eval_gpt.py │ ├── eval_open_vocab_seg_detectron2.py │ ├── ferret-bench │ │ ├── box_refer_caption.json │ │ └── box_refer_reason.json │ ├── ferret_bench_eval.py │ ├── gpt_eval.sh │ ├── lvis_paco_eval.py │ ├── osprey_generate_gpt_description_answer.py │ ├── pope │ │ └── evaluate.py │ ├── pope_eval.py │ ├── pope_eval.sh │ ├── refcocog_eval.py │ ├── rule.json │ ├── summarize_gpt_score.py │ └── utils │ │ ├── __init__.py │ │ ├── ade20k_150_with_prompt_eng.txt │ │ ├── cityscapes_with_prompt_eng.txt │ │ ├── instance_evaluation.py │ │ ├── openseg_classes.py │ │ ├── register_ade20k_panoptic.py │ │ └── register_cityscapes_panoptic.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── consolidate.py │ ├── language_model │ │ └── osprey_llama.py │ ├── layer.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── osprey_arch.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── osprey_trainer.py │ ├── train.py │ └── train_mem.py └── utils.py ├── pyproject.toml └── scripts ├── stage2.sh ├── stage3.sh ├── zero2.json ├── zero3.json └── zero3_offload.json /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__ 2 | .DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 |
6 | 7 | ![Static Badge](https://img.shields.io/badge/Osprey-v1-F7C97E) [![arXiv preprint](https://img.shields.io/badge/arxiv-2312.10032-ECA8A7?logo=arxiv)](https://arxiv.org/pdf/2312.10032.pdf) [![Dataset](https://img.shields.io/badge/Dataset-Hugging_Face-CFAFD4)](https://huggingface.co/datasets/AntGroup-MI/Osprey-724K) [![video](https://img.shields.io/badge/Watch_Video-36600E?logo=youtube&logoColor=green)](https://youtu.be/YsxqHBBnDfk) [![Static Badge](https://img.shields.io/badge/Try_Demo-6B88E3?logo=youtubegaming&logoColor=DAE4EE)](http://111.0.123.204:8000/) 8 |
9 | 10 | 11 |
12 | Demo username & password: osprey 13 |
14 | 15 | --- 16 | 17 |
18 | 19 |
20 | A part of Along the River During the Qingming Festival (清明上河图) 21 |
22 | 23 |
24 | Spirited Away (千与千寻) 25 |
26 | 27 |
28 | 29 |
💡 Some of our other multimodal-LLM projects may interest you ✨.

30 | 31 | 32 | > [**VideoRefer Suite: Advancing Spatial-Temporal Object Understanding with Video LLM**](https://arxiv.org/abs/2501.00599)
33 | > Yuqian Yuan, Hang Zhang, Wentong Li, Zesen Cheng, Boqiang Zhang, Long Li, Xin Li, Deli Zhao, Wenqiao Zhang, Yueting Zhuang, Jianke Zhu, Lidong Bing
34 | [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/DAMO-NLP-SG/VideoRefer) [![github](https://img.shields.io/github/stars/DAMO-NLP-SG/VideoRefer.svg?style=social)](https://github.com/DAMO-NLP-SG/VideoRefer) [![arXiv](https://img.shields.io/badge/Arxiv-2501.00599-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2501.00599)
35 | 36 | > [**TokenPacker: Efficient Visual Projector for Multimodal LLM**](https://arxiv.org/abs/2407.02392)
37 | > Wentong Li*, Yuqian Yuan*, Jian Liu, Dongqi Tang, Song Wang, Jianke Zhu, Lei Zhang
38 | [![github](https://img.shields.io/badge/-Github-black?logo=github)](https://github.com/CircleRadon/TokenPacker) [![github](https://img.shields.io/github/stars/CircleRadon/TokenPacker.svg?style=social)](https://github.com/CircleRadon/TokenPacker) [![arXiv](https://img.shields.io/badge/Arxiv-2407.02392-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2407.02392)
39 | 40 | 41 | ## Updates 📌 42 | 43 | [2025/4/22]🔥 Our defined metrics (Sem. Sim. & Sem. IoU) on Referring Object Classification have been adopted in [Describe Anything Model](https://arxiv.org/pdf/2504.16072) (NVIDIA & UC Berkeley). 44 | 45 | [2025/2/27]🔥 Our new work, [VideoRefer Suite](https://github.com/DAMO-NLP-SG/VideoRefer), has been accepted to CVPR2025! This project focuses on video referring. 46 | 47 | [2024/11/27]🔥 Our defined metrics (Sem. Sim. & Sem. IoU) on Referring Object Classification have been adopted in [ChatRex](https://arxiv.org/abs/2411.18363) (IDEA). 48 | 49 | [2024/3/29]🔥 We released the [Osprey-Chat](https://huggingface.co/sunshine-lwt/Osprey-Chat-7b/tree/main) model, which exhibits better conversation and image-level understanding & reasoning capabilities. 50 | 51 | [2024/2/27]🔥 Osprey has been accepted to CVPR2024! 52 | 53 | [2024/1/15]🔥 We released the [evaluation](./osprey/eval/README.md) code. 54 | 55 | [2023/12/29]🔥 We released the training code and the [Osprey-724K](https://huggingface.co/datasets/AntGroup-MI/Osprey-724K) dataset. 56 | 57 | [2023/12/18]🔥 We released the code, the [Osprey-7b model](https://huggingface.co/sunshine-lwt/Osprey-7b/tree/main) and the [online demo](http://111.0.123.204:8000/) for Osprey. 58 | 59 | 60 | ## What is Osprey 👀 61 | Osprey is a mask-text instruction-tuning approach that extends MLLMs by incorporating pixel-wise mask regions into language instructions, enabling **fine-grained visual understanding**. Given an input mask region, Osprey generates semantic descriptions at two levels of granularity: a **short description** and a **detailed description**. 62 | 63 | Osprey can seamlessly integrate with [SAM](https://github.com/facebookresearch/segment-anything) in point-prompt, box-prompt and segment-everything modes to generate the semantics associated with specific parts or objects. 64 | 65 | 66 | 67 | ## Watch Video Demo 🎥 68 | 69 |

70 | 71 | 72 | ## Try Our Demo 🕹️ 73 | ### Online demo 74 | **Click** 👇 **to try our demo online.** 75 | 76 | [**web demo**](http://111.0.123.204:8000/) 77 | 78 | ``` 79 | username: osprey 80 | password: osprey 81 | ``` 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 |

Demo modes: Point · Box · Everything
98 | 99 | ### Offline demo 100 | 💻 **Requirements:** This demo needs about `17GB` of GPU memory: `15GB` for Osprey and `2GB` for SAM. 101 | 102 | 1. First install [Gradio-Osprey-Demo](https://github.com/LiWentomng/gradio-osprey-demo). 103 | 2. Install Segment Anything. 104 | ``` 105 | pip install git+https://github.com/facebookresearch/segment-anything.git 106 | ``` 107 | 108 | 3. Download all the checkpoints: 109 | 110 | - [Osprey-7b](https://huggingface.co/sunshine-lwt/Osprey-7b/tree/main) 111 | - [CLIP-convnext](https://huggingface.co/laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup/blob/main/open_clip_pytorch_model.bin) 112 | - [ViT-B SAM model](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth) 113 | 114 | The default paths of the checkpoints: 115 | ``` 116 | ├── demo 117 | ├── checkpoints 118 | │ ├── Osprey_7b 119 | │ └── sam_vit_b_01ec64.pth 120 | └── open_clip_pytorch_model.bin 121 | ``` 122 | 123 | Alternatively, change `"mm_vision_tower"` in the `config.json` of the Osprey-7b model to the absolute path of `open_clip_pytorch_model.bin`. 124 | 125 | 4. Run `app.py`. 126 | ``` 127 | cd demo 128 | python app.py --model checkpoints/Osprey_7b 129 | ``` 130 | 131 | ## Install 🛠️ 132 | 1. Clone this repository and navigate to the Osprey folder 133 | ``` 134 | git clone https://github.com/CircleRadon/Osprey.git 135 | cd Osprey 136 | ``` 137 | 2. Install packages 138 | ``` 139 | conda create -n osprey python=3.10 -y 140 | conda activate osprey 141 | pip install --upgrade pip # enable PEP 660 support 142 | pip install -e . 143 | ``` 144 | 3. Install additional packages for training 145 | ``` 146 | pip install -e ".[train]" 147 | pip install flash-attn --no-build-isolation 148 | ``` 149 | 150 | ## Dataset 🌟 151 | All datasets used for training can be found in [Dataset preparation](./dataset.md). 152 | 153 | **Osprey-724K**: 🤗[Hugging Face](https://huggingface.co/datasets/AntGroup-MI/Osprey-724K) 154 | 155 | `Osprey-724K` is an instruction dataset with mask-text pairs, containing around 724K GPT-generated multimodal dialogues to encourage fine-grained pixel-level image understanding in MLLMs. It contains object-level, part-level and additional instruction samples for robustness and flexibility. 156 | 157 | 158 | ## Training 🚀 159 | - **Stage1: Image-Text Alignment Pre-training** 160 | - The pretrained projector weights for Convnext-large-CLIP can be found in [projector weights](https://huggingface.co/sunshine-lwt/osprey-v1.0-mlp2x-512px-convnext-pretrain-vicuna-7b-v1.5/tree/main). 161 | 162 | - **Stage2: Mask-Text Alignment Pre-training** 163 | - Download [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5/tree/main). 164 | - Download the projector weights trained in stage1: [projector weights](https://huggingface.co/sunshine-lwt/osprey-v1.0-mlp2x-512px-convnext-pretrain-vicuna-7b-v1.5/tree/main). 165 | - Set `model_name_or_path` in `stage2.sh` to the path of `vicuna-7b-v1.5`. 166 | - Set `pretrain_mm_mlp_adapter` in `stage2.sh` to the path of `mm_projector`. 167 | - Set `vision_tower` in `stage2.sh` to the path of [Convnext-large-CLIP-model](https://huggingface.co/laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup/blob/main/open_clip_pytorch_model.bin). 168 | - Run `sh scripts/stage2.sh`. 169 | 170 | - **Stage3: End-to-End Fine-tuning** 171 | 172 | - Set `model_name_or_path` in `stage3.sh` to the path of the stage2 checkpoint.
173 | - Set `vision_tower` in `stage3.sh` to the path of [Convnext-large-CLIP-model](https://huggingface.co/laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup/blob/main/open_clip_pytorch_model.bin). 174 | - Run `sh scripts/stage3.sh`. 175 | 176 | 177 | ## Checkpoints 🤖 178 | 179 | Osprey-7b model🤗: [model](https://huggingface.co/sunshine-lwt/Osprey-7b/tree/main) 180 | 181 | We also provide the intermediate stage2 checkpoint; see [model](https://huggingface.co/sunshine-lwt/Osprey-7b-stage2/tree/main). 182 | 183 |
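
For reference, a minimal loading sketch that mirrors what `demo/osprey_inference.py` does. The checkpoint path is a placeholder, and the two special region tokens (written here as `<mask>` and `<pos>`) plus the 512px CLIP preprocessing are taken from the demo code; treat this as a sketch, not a drop-in script.

```python
import torch
from transformers import AutoTokenizer, CLIPImageProcessor
from osprey.model.language_model.osprey_llama import OspreyLlamaForCausalLM

model_path = "checkpoints/Osprey_7b"  # placeholder: wherever Osprey-7b was downloaded

# Tokenizer plus the special region tokens used in Osprey prompts.
tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=2048,
                                          padding_side="right", use_fast=True)
tokenizer.add_tokens(['<mask>', '<pos>'], special_tokens=True)

# Language model in bfloat16; the ConvNeXt-Large CLIP vision tower is loaded on demand.
model = OspreyLlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).to('cuda')
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(dtype=torch.float16, device='cuda')

# 512x512 CLIP preprocessing, matching the demo settings.
image_processor = CLIPImageProcessor(
    do_resize=True, size={"shortest_edge": 512}, resample=3,
    do_center_crop=True, crop_size={"height": 512, "width": 512},
    do_rescale=True, rescale_factor=1 / 255, do_normalize=True,
    image_mean=[0.48145466, 0.4578275, 0.40821073],
    image_std=[0.26862954, 0.26130258, 0.27577711], do_convert_rgb=True)
```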
184 | 185 |
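
Putting the demo pieces together, a rough sketch of the inference flow: SAM turns a clicked point into a mask, and Osprey describes the masked region. This assumes the helper modules under `demo/` are importable (e.g. it is run from inside `demo/`), that the checkpoints sit in the default layout above, and that `example.jpg` and the click coordinates are placeholders.

```python
import cv2
from inference import get_sam_predictor, run_inference  # demo/inference.py
from osprey_inference import Osprey                      # demo/osprey_inference.py

# Load SAM (ViT-B) and the Osprey-7b model from the default checkpoint layout.
predictor = get_sam_predictor(model_type='vit_b', device='cuda')
osprey = Osprey('checkpoints/Osprey_7b', device='cuda')

# Read an image as RGB and pick a foreground point (x, y) with label 1.
image = cv2.cvtColor(cv2.imread('example.jpg'), cv2.COLOR_BGR2RGB)  # placeholder image
selected_points = [((450, 300), 1)]                                 # placeholder click

# SAM: point prompt -> binary masks of shape (1, 1, H, W).
masks = run_inference(predictor, image, selected_points)
region_mask = masks[0][0].astype('float32')  # (H, W) 0/1 mask for the clicked object

# Osprey: masked region -> text; 'short description' selects the short-phrase prompt,
# any other value falls back to the detailed-description prompt.
caption = osprey.osprey_predict(image, region_mask, type='short description')
print(caption)
```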
186 | 187 | 188 | ## Evaluation 🔎 189 | See [evaluation](./osprey/eval/README.md) for details. 190 | 191 | 192 | ## TODO List 📝 193 | - [x] Release the checkpoints, inference codes and demo. 194 | - [x] Release the dataset and training scripts. 195 | - [x] Release the evaluation code. 196 | - [x] Release the code for data generation pipeline. 197 | 198 | 199 | ## Acknowledgement 💌 200 | - [LLaVA-v1.5](https://github.com/haotian-liu/LLaVA): the codebase we built upon. 201 | - [SAM](https://github.com/facebookresearch/segment-anything): the demo uses the segmentation result from SAM as the input of Osprey. 202 | 203 | 204 | ## BibTeX 🖊️ 205 | ``` 206 | @misc{Osprey, 207 | title={Osprey: Pixel Understanding with Visual Instruction Tuning}, 208 | author={Yuqian Yuan, Wentong Li, Jian Liu, Dongqi Tang, Xinjie Luo, Chi Qin, Lei Zhang and Jianke Zhu}, 209 | year={2023}, 210 | eprint={2312.10032}, 211 | archivePrefix={arXiv}, 212 | primaryClass={cs.CV} 213 | } 214 | ``` 215 | -------------------------------------------------------------------------------- /assets/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/data.png -------------------------------------------------------------------------------- /assets/demo_all.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/demo_all.gif -------------------------------------------------------------------------------- /assets/demo_box.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/demo_box.gif -------------------------------------------------------------------------------- /assets/demo_point.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/demo_point.gif -------------------------------------------------------------------------------- /assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/framework.png -------------------------------------------------------------------------------- /assets/osprey.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/osprey.png -------------------------------------------------------------------------------- /assets/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/performance.png -------------------------------------------------------------------------------- /assets/qmsht.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/qmsht.gif -------------------------------------------------------------------------------- /assets/qyqx.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/qyqx.gif -------------------------------------------------------------------------------- /assets/table1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/table1.png -------------------------------------------------------------------------------- /assets/table2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/table2.png -------------------------------------------------------------------------------- /assets/table3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/table3.png -------------------------------------------------------------------------------- /assets/table4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/table4.png -------------------------------------------------------------------------------- /assets/table5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/table5.png -------------------------------------------------------------------------------- /assets/table6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/table6.png -------------------------------------------------------------------------------- /assets/video_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/assets/video_cover.png -------------------------------------------------------------------------------- /dataset.md: -------------------------------------------------------------------------------- 1 | # Dataset Preparation 2 | 3 | - Osprey-724K 🤗 [download](https://huggingface.co/datasets/AntGroup-MI/Osprey-724K) 4 | 5 | | Data | Size | 6 | | --- | ---: | 7 | | osprey_short_form.json | 57 MB | 8 | | osprey_conversation.json | 106 MB | 9 | | osprey_detail_description.json | 63.4 MB | 10 | | osprey_part_level.json | 153 MB | 11 | | osprey_lvis_positive_negative.json | 140 MB | 12 | 13 | 14 | - COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), `imgs` should contain all the images including training set and validation set. 15 | - pascal_part: [train.json](https://huggingface.co/datasets/sunshine-lwt/Osprey-TrainingData/resolve/main/pascalpart_train.json?download=true), [VOCdevkit](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar). 16 | - partImagenet: [train_format.json](https://huggingface.co/datasets/sunshine-lwt/Osprey-TrainingData/resolve/main/partImagenet_train_format.json?download=true), 17 | [PartImageNet_OOD](https://drive.google.com/file/d/19kA8-pAxssQI0GD5H8y8KESGaALwChtx/view?usp=sharing). 
18 | - refcocos: [refcoco](https://huggingface.co/datasets/sunshine-lwt/Osprey-TrainingData/resolve/main/finetune_refcoco_train_with_mask.json?download=true), [refcoco+](https://huggingface.co/datasets/sunshine-lwt/Osprey-TrainingData/resolve/main/finetune_refcoco%2B_train_with_mask.json?download=true). 19 | - vg: [vg_train_with_mask.json](https://huggingface.co/datasets/sunshine-lwt/Osprey-TrainingData/resolve/main/vg_train_with_mask.json?download=true) (mask is generated from [HQ-SAM](https://github.com/SysCV/sam-hq)), images can be downloaded from [OpendataLab](https://opendatalab.com/OpenDataLab/Visual_Genome_Dataset_V1_dot_2), `image` should contain all the vg images(VG_100K and VG_100K_2). 20 | - vcr: [vcr](https://visualcommonsense.com/download/). 21 | 22 | After downloading all of them, organize the data as follows in `./data`, 23 | 24 | 25 | ``` 26 | ├── coco 27 | │ ├── annotations 28 | │ │ └── instances_train2017.json 29 | │ └── imgs 30 | ├── part data 31 | │ ├── pascal_part 32 | │ │ ├── train.json 33 | │ │ └── VOCdevkit 34 | │ └── partImagenet 35 | │ ├── train_format.json 36 | │ └── train 37 | ├── refcocos 38 | │ ├── finetune_refcoco_train_with_mask.json 39 | │ └── finetune_refcoco+_train_with_mask.json 40 | ├── Osprey-724K 41 | │ ├── osprey_short_form.json 42 | │ ├── osprey_conversation.json 43 | │ ├── osprey_detail_description.json 44 | │ ├── osprey_part_level.json 45 | │ └── osprey_lvis_positive_negative.json 46 | ├── vg 47 | │ ├── vg_train_with_mask.json 48 | │ └── image 49 | └── vcr 50 | ├── train.jsonl 51 | └── vcr1images 52 | ``` 53 | -------------------------------------------------------------------------------- /demo/inference.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import numpy as np 4 | import torch 5 | from segment_anything import SamPredictor, sam_model_registry, SamAutomaticMaskGenerator 6 | 7 | models = { 8 | 'vit_b': './checkpoints/sam_vit_b_01ec64.pth', 9 | 'vit_l': './checkpoints/sam_vit_l_0b3195.pth', 10 | 'vit_h': './checkpoints/sam_vit_h_4b8939.pth' 11 | } 12 | 13 | def get_sam_predictor(model_type='vit_b', device='cuda'): 14 | # sam model 15 | sam = sam_model_registry[model_type](checkpoint=models[model_type]) 16 | sam = sam.to(device) 17 | 18 | predictor = SamPredictor(sam) 19 | 20 | return predictor 21 | 22 | def get_mask_generator(model_type='vit_b', device='cuda'): 23 | sam = sam_model_registry[model_type](checkpoint=models[model_type]) 24 | sam = sam.to(device) 25 | mask_generator = SamAutomaticMaskGenerator( 26 | model=sam) 27 | return mask_generator 28 | 29 | def run_inference(predictor: SamPredictor, input_x, selected_points, 30 | multi_object: bool = False): 31 | 32 | if len(selected_points) == 0: 33 | return [] 34 | 35 | predictor.set_image(input_x) 36 | 37 | points = torch.Tensor( 38 | [p for p, _ in selected_points] 39 | ).to(predictor.device).unsqueeze(0) 40 | 41 | labels = torch.Tensor( 42 | [int(l) for _, l in selected_points] 43 | ).to(predictor.device).unsqueeze(0) 44 | 45 | transformed_points = predictor.transform.apply_coords_torch( 46 | points, input_x.shape[:2]) 47 | # print(transformed_points.shape) 48 | # predict segmentation according to the boxes 49 | masks, scores, logits = predictor.predict_torch( 50 | point_coords=transformed_points, 51 | point_labels=labels, 52 | multimask_output=False, 53 | ) 54 | masks = masks.cpu().detach().numpy() 55 | 56 | gc.collect() 57 | torch.cuda.empty_cache() 58 | 59 | return masks 60 | 61 | 62 | def predict_box(predictor: 
SamPredictor, input_x, input_box): 63 | predictor.set_image(input_x) 64 | 65 | input_boxes = torch.tensor(input_box[None, :], device=predictor.device) 66 | transformed_boxes = predictor.transform.apply_boxes_torch(input_boxes, input_x.shape[:2]) 67 | 68 | masks, _, _ = predictor.predict_torch( 69 | point_coords=None, 70 | point_labels=None, 71 | boxes = transformed_boxes, 72 | multimask_output = False 73 | ) 74 | masks = masks.cpu().detach().numpy() 75 | 76 | gc.collect() 77 | torch.cuda.empty_cache() 78 | return masks -------------------------------------------------------------------------------- /demo/osprey_inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from osprey.utils import disable_torch_init 3 | from transformers import AutoTokenizer, CLIPImageProcessor 4 | from osprey.model.language_model.osprey_llama import OspreyLlamaForCausalLM 5 | from osprey.mm_utils import tokenizer_image_token 6 | from osprey.conversation import conv_templates, SeparatorStyle 7 | from osprey.constants import IMAGE_TOKEN_INDEX 8 | from osprey.train.train import DataArguments 9 | 10 | from functools import partial 11 | import os 12 | import numpy as np 13 | import cv2 14 | 15 | data_args = DataArguments() 16 | data_args.mm_use_im_start_end = False 17 | data_args.is_multimodal = True 18 | 19 | def show_mask(mask, image, random_color=True, img_trans=0.9, mask_trans=0.5, return_color=False): 20 | if random_color: 21 | color = np.concatenate([np.random.random(3)*255], axis=0) 22 | else: 23 | color = np.array([30, 144, 255]) 24 | h,w = mask.shape[-2:] 25 | mask_image = mask.reshape(h,w,1)*color.reshape(1,1,-1) 26 | 27 | image = cv2.addWeighted(image, img_trans, mask_image.astype('uint8'), mask_trans , 0) 28 | if return_color: 29 | return image, mask_image 30 | else: 31 | return image 32 | 33 | class Osprey(): 34 | def __init__(self, model_path, device='cuda'): 35 | disable_torch_init() 36 | model_path = os.path.expanduser(model_path) 37 | self.tokenizer = AutoTokenizer.from_pretrained( 38 | model_path, 39 | model_max_length=2048, 40 | padding_side="right", 41 | use_fast=True 42 | ) 43 | self.model = OspreyLlamaForCausalLM.from_pretrained( 44 | model_path, 45 | torch_dtype=torch.bfloat16, 46 | ).to(device) 47 | self.tokenizer.pad_token = self.tokenizer.unk_token 48 | 49 | self.image_processor = CLIPImageProcessor(do_resize=True, size={"shortest_edge":512}, resample=3, do_center_crop=True, crop_size={"height": 512, "width": 512}, 50 | do_rescale=True, rescale_factor=0.00392156862745098, do_normalize=True, image_mean=[0.48145466, 0.4578275, 0.40821073], 51 | image_std=[0.26862954, 0.26130258, 0.27577711], do_convert_rgb=True, ) 52 | 53 | spi_tokens = ['', ''] 54 | self.tokenizer.add_tokens(spi_tokens, special_tokens=True) 55 | 56 | for m in self.model.modules(): 57 | m.tokenizer = self.tokenizer 58 | 59 | vision_tower = self.model.get_vision_tower() 60 | if not vision_tower.is_loaded: 61 | vision_tower.load_model() 62 | vision_tower.to(dtype=torch.float16, device=device) 63 | 64 | begin_str = """\n\nThis provides an overview of the picture.\n""" 65 | 66 | short_question = 'Please give me a short description of . Using a short phrase.' 
67 | 68 | conv = conv_templates['osprey_v1'].copy() 69 | qs = begin_str+short_question 70 | conv.append_message(conv.roles[0], qs) 71 | conv.append_message(conv.roles[1], None) 72 | prompt = conv.get_prompt() 73 | 74 | self.input_ids_short = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.model.device) 75 | 76 | detailed_question = 'Can you give me a detailed description of ?' 77 | 78 | conv = conv_templates['osprey_v1'].copy() 79 | qs = begin_str+detailed_question 80 | conv.append_message(conv.roles[0], qs) 81 | conv.append_message(conv.roles[1], None) 82 | prompt = conv.get_prompt() 83 | 84 | self.input_ids_detailed = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.model.device) 85 | 86 | self.stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 87 | 88 | 89 | 90 | def osprey_predict(self, img, mask, type=None): 91 | image = self.image_processor.preprocess(img, 92 | do_center_crop=False, 93 | return_tensors='pt')['pixel_values'][0] 94 | 95 | image = torch.nn.functional.interpolate(image.unsqueeze(0), 96 | size=(512, 512), 97 | mode='bilinear', 98 | align_corners=False).squeeze(0) 99 | 100 | masks = torch.Tensor(mask).unsqueeze(0).to(self.model.device) 101 | 102 | 103 | if type == 'short description': 104 | input_ids = self.input_ids_short 105 | else: 106 | input_ids = self.input_ids_detailed 107 | 108 | # self.model.model.tokenizer = self.tokenizer 109 | 110 | with torch.inference_mode(): 111 | 112 | self.model.orig_forward = self.model.forward 113 | self.model.forward = partial(self.model.orig_forward, 114 | img_metas=[None], 115 | masks=[masks.half()]) 116 | 117 | output_ids = self.model.generate( 118 | input_ids, 119 | images=image.unsqueeze(0).half().to(self.model.device), 120 | do_sample=True, 121 | temperature=0.2, 122 | max_new_tokens=1024, 123 | use_cache=True, 124 | num_beams=1, 125 | # stopping_criteria=[stopping_criteria] 126 | ) 127 | 128 | self.model.forward = self.model.orig_forward 129 | 130 | input_token_len = input_ids.shape[1] 131 | n_diff_input_output = ( 132 | input_ids != output_ids[:, :input_token_len]).sum().item() 133 | if n_diff_input_output > 0: 134 | print( 135 | f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 136 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], 137 | skip_special_tokens=True)[0] 138 | 139 | outputs = outputs.strip() 140 | if outputs.endswith(self.stop_str): 141 | outputs = outputs[:-len(self.stop_str)] 142 | outputs = outputs.strip() 143 | if ':' in outputs: 144 | outputs = outputs.split(':')[1] 145 | 146 | outputs_list = outputs.split('.') 147 | outputs_list_final = [] 148 | outputs_str = '' 149 | for output in outputs_list: 150 | if output not in outputs_list_final: 151 | if output=='': 152 | continue 153 | outputs_list_final.append(output) 154 | outputs_str+=output+'.' 
155 | else: 156 | break 157 | return outputs_str -------------------------------------------------------------------------------- /osprey/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import OspreyLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /osprey/configs/stage2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type": "coco_data", 4 | "ann_file": "./data/coco/annotations/instances_train2017.json", 5 | "img_prefix": "./data/coco/imgs" 6 | }, 7 | 8 | { 9 | "type": "RefCOCO", 10 | "ann_file": "./data/refcocos/finetune_refcoco_train_with_mask.json", 11 | "img_prefix": "./data/coco/imgs" 12 | }, 13 | 14 | { 15 | "type": "RefCOCOP", 16 | "ann_file": "./data/refcocos/finetune_refcoco+_train_with_mask.json", 17 | "img_prefix": "./data/coco/imgs" 18 | }, 19 | 20 | { 21 | "type": "PascalPart", 22 | "ann_file": "./data/part_data/pascal_part/train.json", 23 | "img_prefix": "./data/part_data/pascal_part/VOCdevkit/VOC2010/JPEGImages" 24 | }, 25 | 26 | { 27 | "type": "PartImagenet", 28 | "ann_file": "./data/part_data/partImagenet/train_format.json", 29 | "img_prefix": "./data/part_data/partImagenet/train/" 30 | } 31 | ] 32 | -------------------------------------------------------------------------------- /osprey/configs/stage3.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type": "OspreyShortForm", 4 | "ann_file": "./data/Osprey-724K/osprey_short_form.json", 5 | "img_prefix": "./data/coco_imgs/" 6 | }, 7 | 8 | { 9 | "type": "OspreyLVISPosNeg", 10 | "ann_file": "./data/Osprey-724K/osprey_lvis_positive_negative.json", 11 | "img_prefix": "./data/coco_imgs/" 12 | }, 13 | 14 | { 15 | "type": "OspreyConversations", 16 | "ann_file": "./data/Osprey-724K/osprey_conversation.json", 17 | "img_prefix": "./data/coco_imgs/" 18 | }, 19 | 20 | { 21 | "type": "OspreyDetailedDescription", 22 | "ann_file": "./data/Osprey-724K/osprey_detailed_description.json", 23 | "img_prefix": "./data/coco_imgs/" 24 | }, 25 | 26 | { 27 | "type": "OspreyPartLevel", 28 | "ann_file": "./data/Osprey-724K/osprey_part_level.json", 29 | "img_prefix": "./data/coco_imgs/" 30 | }, 31 | 32 | { 33 | "type": "VGDATA", 34 | "ann_file": "./data/vg/vg_train_with_mask.json", 35 | "img_prefix": "./data/vg/image" 36 | }, 37 | 38 | { 39 | "type": "vcr", 40 | "ann_file": "./data/vcr/train.jsonl", 41 | "img_prefix": "./data/vcr/vcr1images" 42 | } 43 | ] 44 | -------------------------------------------------------------------------------- /osprey/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /osprey/data_generation/ask_gpt.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | class askGPT(): 4 | def __init__(self): 5 | # fill in the api key here 6 | openai.api_key = xxx 7 | 8 | def ask_gpt(self, question): 9 | with open('description/system_message.txt', 'r') as f: 10 | system_message = f.read() 11 | with open('description/ask_example.txt', 'r') as f: 12 | example_ask = f.read() 13 | with open('description/res_example.txt', 'r') as f: 14 | example_res = f.read() 15 | completion = openai.ChatCompletion.create( 16 | model="gpt-4", 17 | messages=[ 18 | {"role": "system", "content": system_message}, 19 | {"role": "user", "content": example_ask}, 20 | {"role": "assistant", "content": example_res}, 21 | {"role": "user", "content": question} 22 | ] 23 | ) 24 | return completion.choices[0].message['content'] 25 | 26 | def ask_gpt_short_conversation(self, question): 27 | with open('concise_qa/system_message.txt', 'r') as f: 28 | system_message = f.read() 29 | with open('concise_qa/ask_example.txt', 'r') as f: 30 | example_ask = f.read() 31 | with open('concise_qa/res_example.txt', 'r') as f: 32 | example_res = f.read() 33 | completion = openai.ChatCompletion.create( 34 | model="gpt-4", 35 | messages=[ 36 | {"role": "system", "content": system_message}, 37 | {"role": "user", "content": example_ask}, 38 | {"role": "assistant", "content": example_res}, 39 | {"role": "user", "content": question} 40 | ] 41 | ) 42 | return completion.choices[0].message['content'] 43 | 44 | def ask_gpt_conversation(self, question): 45 | with open('conversation/system_message.txt', 'r') as f: 46 | system_message = f.read() 47 | with open('conversation/ask_example.txt', 'r') as f: 48 | example_ask = f.read() 49 | with open('conversation/res_example.txt', 'r') as f: 50 | example_res = f.read() 51 | completion = openai.ChatCompletion.create( 52 | model="gpt-4", 53 | messages=[ 54 | {"role": "system", "content": system_message}, 55 | {"role": "user", "content": example_ask}, 56 | {"role": "assistant", "content": example_res}, 57 | {"role": "user", "content": question} 58 | ] 59 | ) 60 | return completion.choices[0].message['content'] 61 | 62 | -------------------------------------------------------------------------------- /osprey/data_generation/concise_qa/ask_example.txt: -------------------------------------------------------------------------------- 1 | Whole description: "The image presents a lively market scene with a group of people buying fruits and bags. There are multiple individuals in the market, all browsing through the fresh produce available. A significant variety of fruits are showcased in the market, including bananas, oranges, and apples. Bananas can be seen in several groups, with some green and yellow bananas occupying different areas of the market. Meanwhile, oranges and apples are displayed in smaller sections among the fruits. In addition to fruits, handbags are also being sold at the market, attracting the attention of the customers. Overall, the market bustles with activity as people gather around the fresh fruits and bags, contemplating their purchases." 
2 | 3 | Description of each region is listed below: 4 | 5 | (person: [0.507,0.409,0.698,0.740]): 6 | gray shirt wearing glasses 7 | woman with gray shirt standing next to man 8 | woman in gray shirt facing camera on right 9 | the woman in the grey shirt with a watch on her wrist .. 10 | a short haired woman in jeans shopping 11 | 12 | (person: [0.243,0.469,0.558,0.746]): 13 | navy blue shirt 14 | woman back in blue 15 | the lady with the blue shirt 16 | the back of an older woman with her hair in a barrette with a blue jacket on 17 | a woman is wearing blue sweater 18 | 19 | -------------------------------------------------------------------------------- /osprey/data_generation/concise_qa/res_example.txt: -------------------------------------------------------------------------------- 1 | Question 1: What is the woman in wearing on her wrist? 2 | 3 | Answer 1: Watch. 4 | 5 | Question 2: What is the color of the older woman's shirt in ? 6 | 7 | Answer 2: Navy blue. 8 | 9 | Question 3: What hair accessory does the older woman in have? 10 | 11 | Answer 3: Barrette. 12 | 13 | Question 4: Which direction is the older woman in facing? 14 | 15 | Answer 4: Away. 16 | 17 | -------------------------------------------------------------------------------- /osprey/data_generation/concise_qa/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant, and you are seeing several object regions in a single image. What you see are provided with a detailed description for the whole image and each object region in this image, describing you are looking at. Answer all questions as you are seeing the image. 2 | The location of each object region is given in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 3 | Design a conversation between you and a person asking about each object region of this image. The answers must be in one word or one phrase. Ask diverse questions and give corresponding answers. All the regions given should be mentioned in the questions, when referring to each region, use , , etc. 4 | Include questions asking about the visual content of each object region in the image, including the object category, object type, object color, object actions, object locations, relative positions between objects and other attributes etc. Only include questions that have definite answers: 5 | (1) one can see the content in the object region of this image that the question asks about and can answer confidently; 6 | (2) one can determine confidently from the object region of this image that it is not in the image. 7 | Do not ask any question that cannot be answered confidently. 8 | Do not ask any question that is not mentioned. 9 | Do not ask any question that cannot be answered with one word or phrase. 10 | 11 | Most importantly, the answer must be in one word or short phrase. -------------------------------------------------------------------------------- /osprey/data_generation/conversation/ask_example.txt: -------------------------------------------------------------------------------- 1 | Whole description: "The image presents a lively market scene with a group of people buying fruits and bags. There are multiple individuals in the market, all browsing through the fresh produce available. A significant variety of fruits are showcased in the market, including bananas, oranges, and apples. 
Bananas can be seen in several groups, with some green and yellow bananas occupying different areas of the market. Meanwhile, oranges and apples are displayed in smaller sections among the fruits. In addition to fruits, handbags are also being sold at the market, attracting the attention of the customers. Overall, the market bustles with activity as people gather around the fresh fruits and bags, contemplating their purchases." 2 | 3 | Description of each region is listed below: 4 | 5 | (person: [0.507,0.409,0.698,0.740]): 6 | gray shirt wearing glasses 7 | woman with gray shirt standing next to man 8 | woman in gray shirt facing camera on right 9 | the woman in the grey shirt with a watch on her wrist .. 10 | a short haired woman in jeans shopping 11 | 12 | (person: [0.243,0.469,0.558,0.746]): 13 | navy blue shirt 14 | woman back in blue 15 | the lady with the blue shirt 16 | the back of an older woman with her hair in a barrette with a blue jacket on 17 | a woman is wearing blue sweater 18 | 19 | -------------------------------------------------------------------------------- /osprey/data_generation/conversation/res_example.txt: -------------------------------------------------------------------------------- 1 | Question 1: Can you describe the woman in and what she is doing? 2 | 3 | Answer 1: The woman in is wearing a gray shirt, jeans, and glasses. She has short hair and a watch on her wrist. She is shopping in the lively market, browsing through the fresh produce, such as fruits and bags. 4 | 5 | Question 2: What can you tell me about the woman in ? 6 | 7 | Answer 2: The woman in is wearing a navy blue shirt or sweater. She has her back towards the camera, with her hair styled in a barrette. She appears to be an older woman, also browsing through the market, looking at the various items for sale. 8 | 9 | Question 3: Can you compare the positions of the women in and within the image? 10 | 11 | Answer 3: The woman in is positioned more towards the right side of the image, facing the camera. On the other hand, the woman in is located on the left side of the image with her back to the camera. Both women are engaged in shopping at the market, surrounded by fresh fruits and bags. 12 | 13 | Question 4: Which fruits are visible in the image, and are there any other types of merchandise for sale? 14 | 15 | Answer 4: Bananas, oranges, and apples are visible in the image as the showcased fruits. Besides fruits, there are handbags also being sold at the market. These items are attracting the attention of the customers as they explore the market and contemplate their purchases. 16 | 17 | Question 5: Based on the image, how would you describe the overall atmosphere of the market? 18 | 19 | Answer 5: The market has a bustling and lively atmosphere. There's a sense of excitement and energy as people gather around the fresh fruits and bags for sale. The variety of items available, as well as the interactions among customers, contribute to the vibrant ambience of the market scene. -------------------------------------------------------------------------------- /osprey/data_generation/conversation/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant, and you are seeing several object regions in a single image. What you see are provided with a detailed description for the whole image and each object region in this image, describing you are looking at. Answer all questions as you are seeing the image. 
2 | The location of each object region is given in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 3 | Design a conversation between you and a person asking about each object region of this image. The answers should be in a tone that a visual AI assistant is seeing the image and answering the question. Ask diverse questions and give corresponding answers. All the regions given should be mentioned in the questions, when referring to each region, use , , etc. 4 | Include questions asking about the visual content of each object region in the image, including the object category, object type, object color, object actions, object locations, relative positions between objects and other attributes etc. Only include questions that have definite answers: 5 | (1) one can see the content in the object region of this image that the question asks about and can answer confidently; 6 | (2) one can determine confidently from the object region of this image that it is not in the image. 7 | Do not ask any question that cannot be answered confidently. 8 | Also include complex questions that are relevant to the content of each object region in the image, for example, asking about background knowledge of the objects, asking to discuss about events happening in the image, etc. Again, do not ask about uncertain details. 9 | Provide detailed answers when answering complex questions. For example, give detailed examples or reasoning steps to make the content more convincing and well-organized. You can include multiple paragraphs if necessary. -------------------------------------------------------------------------------- /osprey/data_generation/data_generation_pipeline.sh: -------------------------------------------------------------------------------- 1 | # description 2 | python gpt_data_generation.py \ 3 | --type description \ 4 | --outputfile description_gpt4_data.json 5 | 6 | # conversation 7 | python gpt_data_generation.py \ 8 | --type conversation \ 9 | --outputfile conversation_gpt4_data.json 10 | 11 | # short-form conversation 12 | python gpt_data_generation.py \ 13 | --type short-form \ 14 | --outputfile short_form_conversation_gpt4_data.json 15 | -------------------------------------------------------------------------------- /osprey/data_generation/description/ask_example.txt: -------------------------------------------------------------------------------- 1 | The detailed description of this image: 2 | In the image, a man wearing a white apron and a ball cap is standing in a kitchen. He appears to be a chef or a baker and is actively sliding a pizza into a brick oven. The kitchen is well-equipped with various tools and utensils, such as spoons and bowls, which are placed on the dining table and countertops. A chair is positioned near the table, and multiple bowls can be found throughout the kitchen. The scene conveys a sense of preparation and anticipation for the dish being cooked. 3 | chair: [0.437,0.840,0.808,1.000] 4 | person: [0.530,0.881,0.998,0.987] 5 | person: [0.223,0.202,0.580,0.954] 6 | spoon: [0.068,0.423,0.089,0.435] 7 | Specially, there are 2 special regions: and . 8 | For each one region, you receive several sentences as the description of this region in this image you are observing. 
9 | For (person: [0.223,0.202,0.580,0.954]): 10 | man chef 11 | man cooking 12 | 13 | For (person: [0.530,0.881,0.998,0.987]): 14 | i think its a table infront of the chair 15 | top of head bottom right corner 16 | part of head seen near chair -------------------------------------------------------------------------------- /osprey/data_generation/description/res_example.txt: -------------------------------------------------------------------------------- 1 | : A man wearing a white apron, possibly a chef or a baker, is actively cooking in a well-equipped kitchen. He is standing near the brick oven and appears to be sliding a pizza into it. 2 | 3 | : The top of a person's head is partially visible near the chair, it appears to be a brown-haired individual, though not much else can be discerned from this angle. -------------------------------------------------------------------------------- /osprey/data_generation/description/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive a detailed description/several descriptions of this image. In addition, most object locations within the image are given, along with detailed coordinates. 2 | These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1.These values correspond to the top left x, top left y, bottom right x, and bottom right y. 3 | Your role is to give a detailed description of each special region in the image. Instead of directly mentioning the bounding box coordinates, utilize this data to explain each region using natural language. Include details like object category, object type, object color, attributes of the object, object locations, object state and other attributes. 4 | When using the information from the image and object region captions and coordinates, directly explain the region, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at each region. Provide a direct answer without mention "this region". The answer template is: ": ..." -------------------------------------------------------------------------------- /osprey/data_generation/gpt_data_generation.py: -------------------------------------------------------------------------------- 1 | from ask_gpt import askGPT 2 | from generate_gpt_prompt import GeneratePrompt, COCODataset 3 | import json 4 | from tqdm import tqdm 5 | import re 6 | import argparse 7 | 8 | QUESTIONS = [ 9 | 'Can you provide me with a detailed description of the region in the picture marked by ?', 10 | "I'm curious about the region represented by in the picture. Could you describe it in detail?", 11 | 'What can you tell me about the region indicated by in the image?', 12 | "I'd like to know more about the area in the photo labeled . Can you give me a detailed description?", 13 | 'Could you describe the region shown as in the picture in great detail?', 14 | 'What details can you give me about the region outlined by in the photo?', 15 | 'Please provide me with a comprehensive description of the region marked with in the image.', 16 | 'Can you give me a detailed account of the region labeled as in the picture?', 17 | "I'm interested in learning more about the region represented by in the photo. Can you describe it in detail?", 18 | 'What is the region outlined by in the picture like? 
Could you give me a detailed description?', 19 | 'Can you provide me with a detailed description of the region in the picture marked by , please?', 20 | "I'm curious about the region represented by in the picture. Could you describe it in detail, please?", 21 | 'What can you tell me about the region indicated by in the image, exactly?', 22 | "I'd like to know more about the area in the photo labeled , please. Can you give me a detailed description?", 23 | 'Could you describe the region shown as in the picture in great detail, please?', 24 | 'What details can you give me about the region outlined by in the photo, please?', 25 | 'Please provide me with a comprehensive description of the region marked with in the image, please.', 26 | 'Can you give me a detailed account of the region labeled as in the picture, please?', 27 | "I'm interested in learning more about the region represented by in the photo. Can you describe it in detail, please?", 28 | 'What is the region outlined by in the picture like, please? Could you give me a detailed description?', 29 | 'Please describe the region in the image in detail.', 30 | 'Can you offer a thorough analysis of the region in the image?', 31 | 'Could you elaborate on the region highlighted by in the picture provided?', 32 | 'Please share more information about the zone emphasized with in the photo.', 33 | 'What insights can you give about the area denoted by in the image presented?', 34 | 'Can you share a comprehensive rundown of the region denoted by in the presented image?', 35 | "I'd like to know more about the region highlighted by in the picture provided.", 36 | 'Work through the important details of the area in the image.', 37 | 'Illustrate the area represented by through a descriptive explanation.', 38 | 'Examine the region closely and share its details.'
39 | ] 40 | 41 | class COCODataConvert(): 42 | def __init__(self): 43 | self.gpt = askGPT() 44 | self.generate_gpt_prompt = GeneratePrompt() 45 | self.coco = COCODataset() 46 | 47 | def generate_conversation(self, output_file): 48 | results = [] 49 | imgs = self.coco.img_ids 50 | sum = 0 51 | for id in (tqdm(imgs)): 52 | result = {} 53 | prompt, annotations, num_boxes, height, width = self.generate_gpt_prompt.load_data_and_generate_gpt_prompt_description(id,1) 54 | # print(prompt) 55 | if prompt == None: 56 | # print("none") 57 | continue 58 | 59 | while True: 60 | try: 61 | ret = self.gpt.ask_gpt_conversation(prompt) 62 | description =ret.json()['data']['reply'] 63 | 64 | conversations = [] 65 | description_list = description.split('\n\n') 66 | for i, des in enumerate(description_list): 67 | conv = {} 68 | if i%2==0: 69 | conv['from'] = "human" 70 | conv['value'] = re.findall(r"Question.*:\ (.*)",des)[0] 71 | else: 72 | conv['from'] = "gpt" 73 | conv['value'] = re.findall(r"Answer.*:\ (.*)",des)[0] 74 | conversations.append(conv) 75 | break 76 | except: 77 | print(ret) 78 | 79 | 80 | img_info = self.coco.coco.load_imgs([id])[0] 81 | result['id'] = id 82 | result['file_name'] = img_info['file_name'] 83 | result['conversations'] = conversations 84 | result['annotation'] = annotations 85 | result['height'] = height 86 | result['width'] = width 87 | results.append(result) 88 | 89 | sum+=1 90 | print("num:", sum) 91 | 92 | if sum%100==0: 93 | f = json.dumps(results) 94 | f2 = open(output_file, 'w') 95 | f2.write(f) 96 | f2.close() 97 | 98 | f = json.dumps(results) 99 | f2 = open(output_file, 'w') 100 | f2.write(f) 101 | f2.close() 102 | 103 | 104 | def generate_short_conversation(self, output_file): 105 | results = [] 106 | imgs = self.coco.img_ids 107 | sum = 0 108 | 109 | for id in (tqdm(imgs)): 110 | # print(id) 111 | result = {} 112 | prompt, annotations, num_boxes, height, width = self.generate_gpt_prompt.load_data_and_generate_gpt_prompt_description(id,1) 113 | # print(prompt) 114 | if prompt == None: 115 | # print("none") 116 | continue 117 | while True: 118 | try: 119 | ret = self.gpt.ask_gpt_short_conversation(prompt) 120 | description =ret.json()['data']['reply'] 121 | 122 | conversations = [] 123 | description_list = description.split('\n\n') 124 | 125 | for i, des in enumerate(description_list): 126 | conv = {} 127 | if i%2==0: 128 | conv['from'] = "human" 129 | conv['value'] = re.findall(r"Question.*:\ (.*)",des)[0] 130 | else: 131 | conv['from'] = "gpt" 132 | conv['value'] = re.findall(r"Answer.*:\ (.*)",des)[0] 133 | conversations.append(conv) 134 | break 135 | except: 136 | print(ret) 137 | 138 | img_info = self.coco.coco.load_imgs([id])[0] 139 | result['id'] = id 140 | result['file_name'] = img_info['file_name'] 141 | result['conversations'] = conversations 142 | result['annotation'] = annotations 143 | result['height'] = height 144 | result['width'] = width 145 | results.append(result) 146 | 147 | sum+=1 148 | print("num:",sum) 149 | 150 | if sum%100==0: 151 | f = json.dumps(results) 152 | f2 = open(output_file, 'w') 153 | f2.write(f) 154 | f2.close() 155 | 156 | f = json.dumps(results) 157 | f2 = open(output_file, 'w') 158 | f2.write(f) 159 | f2.close() 160 | 161 | 162 | def generate_descriptions(self, output_file): 163 | results = [] 164 | imgs = self.coco.img_ids 165 | sum = 0 166 | for id in tqdm(imgs): 167 | result = {} 168 | prompt, annotations, num_boxes, height, width = self.generate_gpt_prompt.load_data_and_generate_gpt_prompt_description(id) 169 | # print(prompt) 
170 | 171 | if prompt == None: 172 | # print("None") 173 | continue 174 | 175 | while True: 176 | try: 177 | description = self.gpt.ask_gpt(prompt) 178 | 179 | description_list = description.split('\n\n') 180 | break 181 | except: 182 | print(description) 183 | 184 | 185 | img_info = self.coco.coco.load_imgs([id])[0] 186 | result['id'] = id 187 | result['file_name'] = img_info['file_name'] 188 | result['description'] = description_list 189 | result['annotation'] = annotations 190 | result['height'] = height 191 | result['width'] = width 192 | results.append(result) 193 | 194 | sum+=1 195 | print("num:",sum) 196 | 197 | if sum%100==0: 198 | f = json.dumps(results) 199 | f2 = open(output_file, 'w') 200 | f2.write(f) 201 | f2.close() 202 | 203 | f = json.dumps(results) 204 | f2 = open(output_file, 'w') 205 | f2.write(f) 206 | f2.close() 207 | 208 | 209 | if __name__ == '__main__': 210 | parser = argparse.ArgumentParser(description='data generation pipline', formatter_class=argparse.RawTextHelpFormatter) 211 | parser.add_argument('--type', help='generate data type', default='description') 212 | parser.add_argument('--outputfile', help='output file name', default='description_gpt4_data.json') 213 | args = parser.parse_args() 214 | 215 | convert = COCODataConvert() 216 | 217 | if args.type=='description': 218 | convert.generate_descriptions(args.output_file) 219 | elif args.type=='conversation': 220 | convert.generate_conversation(args.output_file) 221 | elif args.type=='short-form': 222 | convert.generate_short_conversation(args.output_file) 223 | else: 224 | raise NotImplementedError 225 | -------------------------------------------------------------------------------- /osprey/datasets/data_modules.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import torch 3 | import transformers 4 | from torch.utils.data import ConcatDataset 5 | import json 6 | from osprey.constants import IGNORE_INDEX 7 | 8 | from .stage2_data import COCODataset, RefCOCO, RefCOCOP 9 | from .vcr import VCRDataset 10 | from .vg import VGDATA 11 | from .stage2_data import PascalPart 12 | from .stage2_data import PartImagenet 13 | from .osprey_724k import OspreyDetailedDescription, OspreyConversations, OspreyShortForm, OspreyPartLevel, OspreyLVISPosNeg 14 | 15 | @dataclass 16 | class DataCollatorForDetDataset(object): 17 | 18 | tokenizer: transformers.PreTrainedTokenizer 19 | def __call__(self, instances): 20 | 21 | input_ids, labels, img_metas, masks = tuple([instance.get(key,None) for instance in instances] 22 | for key in ('input_ids', 23 | 'labels', 24 | 'img_metas', 25 | 'masks')) 26 | input_ids = torch.nn.utils.rnn.pad_sequence( 27 | input_ids, 28 | batch_first=True, 29 | padding_value=self.tokenizer.pad_token_id) 30 | labels = torch.nn.utils.rnn.pad_sequence(labels, 31 | batch_first=True, 32 | padding_value=IGNORE_INDEX) 33 | 34 | batch = dict( 35 | input_ids=input_ids, 36 | labels=labels, 37 | attention_mask=input_ids.ne(self.tokenizer.pad_token_id), 38 | img_metas=img_metas, 39 | masks = masks 40 | ) 41 | 42 | if 'image' in instances[0]: 43 | images = [instance['image'] for instance in instances] 44 | if all(x is not None and x.shape == images[0].shape for x in images): 45 | batch['images'] = torch.stack(images) 46 | else: 47 | batch['images'] = images 48 | 49 | return batch 50 | 51 | def make_multitask_data_module(tokenizer, 52 | data_args) : 53 | """Make dataset and collator for supervised fine-tuning.""" 54 | 55 | if 
data_args.dataset_config is not None: 56 | dataset_config = json.load(open(data_args.dataset_config)) 57 | 58 | train_dataset = build_osprey_dataset(dataset_config, 59 | tokenizer=tokenizer, 60 | data_args=data_args) 61 | 62 | data_collator = DataCollatorForDetDataset(tokenizer=tokenizer) 63 | 64 | return dict(train_dataset=train_dataset, 65 | eval_dataset=None, 66 | data_collator=data_collator) 67 | 68 | def build_osprey_dataset(dataset_config, 69 | tokenizer=None, 70 | data_args=None, 71 | **kwargs): 72 | if isinstance(dataset_config, list): 73 | datasets = [] 74 | for cfg in dataset_config: 75 | temp_dataset = build_osprey_dataset(cfg, tokenizer=tokenizer, data_args=data_args, **kwargs) 76 | datasets.append(temp_dataset) 77 | 78 | for dataset in datasets: 79 | print(type(dataset), f'len = {len(dataset)}') 80 | 81 | return ConcatDataset(datasets) 82 | 83 | dataset_type = dataset_config.pop('type') 84 | 85 | if dataset_type == 'coco_data': 86 | dataset = COCODataset( 87 | **dataset_config, 88 | tokenizer=tokenizer, 89 | data_args=data_args, 90 | **kwargs, 91 | ) 92 | 93 | elif dataset_type == 'vcr': 94 | dataset = VCRDataset( 95 | **dataset_config, 96 | tokenizer=tokenizer, 97 | data_args=data_args, 98 | **kwargs, 99 | ) 100 | elif dataset_type == 'VGDATA': 101 | dataset = VGDATA( 102 | **dataset_config, 103 | tokenizer=tokenizer, 104 | data_args=data_args, 105 | **kwargs, 106 | ) 107 | elif dataset_type == 'RefCOCO': 108 | dataset = RefCOCO( 109 | **dataset_config, 110 | tokenizer=tokenizer, 111 | data_args=data_args, 112 | **kwargs, 113 | ) 114 | elif dataset_type == 'RefCOCOP': 115 | dataset = RefCOCOP( 116 | **dataset_config, 117 | tokenizer=tokenizer, 118 | data_args=data_args, 119 | **kwargs, 120 | ) 121 | elif dataset_type == 'PascalPart': 122 | dataset = PascalPart( 123 | **dataset_config, 124 | tokenizer=tokenizer, 125 | data_args=data_args, 126 | **kwargs, 127 | ) 128 | elif dataset_type == 'PartImagenet': 129 | dataset = PartImagenet( 130 | **dataset_config, 131 | tokenizer=tokenizer, 132 | data_args=data_args, 133 | **kwargs, 134 | ) 135 | elif dataset_type == 'OspreyDetailedDescription': 136 | dataset = OspreyDetailedDescription( 137 | **dataset_config, 138 | tokenizer=tokenizer, 139 | data_args=data_args, 140 | **kwargs, 141 | ) 142 | elif dataset_type == 'OspreyConversations': 143 | dataset = OspreyConversations( 144 | **dataset_config, 145 | tokenizer=tokenizer, 146 | data_args=data_args, 147 | **kwargs, 148 | ) 149 | elif dataset_type == 'OspreyShortForm': 150 | dataset = OspreyShortForm( 151 | **dataset_config, 152 | tokenizer=tokenizer, 153 | data_args=data_args, 154 | **kwargs, 155 | ) 156 | elif dataset_type == 'OspreyPartLevel': 157 | dataset = OspreyPartLevel( 158 | **dataset_config, 159 | tokenizer=tokenizer, 160 | data_args=data_args, 161 | **kwargs, 162 | ) 163 | elif dataset_type == 'OspreyLVISPosNeg': 164 | dataset = OspreyLVISPosNeg( 165 | **dataset_config, 166 | tokenizer=tokenizer, 167 | data_args=data_args, 168 | **kwargs, 169 | ) 170 | 171 | else: 172 | raise NotImplementedError 173 | 174 | return dataset 175 | 176 | 177 | 178 | class ConcatDataset(ConcatDataset): 179 | def __init__(self, datasets): 180 | super().__init__(datasets) 181 | 182 | def collater(self, samples): 183 | 184 | all_keys = set() 185 | for s in samples: 186 | all_keys.update(s) 187 | 188 | shared_keys = all_keys 189 | for s in samples: 190 | shared_keys = shared_keys & set(s.keys()) 191 | 192 | samples_shared_keys = [] 193 | for s in samples: 194 | samples_shared_keys.append({k: s[k] 
for k in s.keys() if k in shared_keys}) 195 | 196 | return self.datasets[0].collater(samples_shared_keys) 197 | 198 | -------------------------------------------------------------------------------- /osprey/datasets/vcr.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is largely based on https://github.com/jshilong/GPT4RoI/blob/main/gpt4roi/datasets/vcr.py 3 | """ 4 | import copy 5 | import json 6 | import os 7 | import random 8 | from tkinter import N 9 | 10 | import numpy as np 11 | import torch 12 | from PIL import Image 13 | from torch.utils.data import Dataset 14 | from matplotlib import path 15 | from matplotlib import pyplot as plt 16 | from osprey.train.train import preprocess, preprocess_multimodal 17 | 18 | WHY_QUESTIONS = [ 19 | 'why?', 20 | 'why', 21 | "What's the rationale for your decision?", 22 | 'What led you to that conclusion?', 23 | "What's the reasoning behind your opinion?", 24 | 'Why do you believe that to be true?', 25 | 'Can you explain the basis for your thinking?', 26 | 'What factors influenced your perspective?', 27 | 'How did you arrive at that perspective?', 28 | 'What evidence supports your viewpoint?', 29 | 'What makes you think that way?', 30 | "What's the logic behind your argument?", 31 | 'Can you provide some context for your opinion?', 32 | "What's the basis for your assertion?", 33 | 'Why do you hold that belief?', 34 | 'What experiences have shaped your perspective?', 35 | 'What assumptions underlie your reasoning?', 36 | "What's the foundation of your assertion?", 37 | "What's the source of your reasoning?", 38 | "What's the motivation behind your decision?", 39 | "What's the impetus for your belief?", 40 | "What's the driving force behind your conclusion?", 41 | 'Why do you think that?', 42 | "What's your reasoning?", 43 | 'What makes you say that?', 44 | 'Why do you feel that way?', 45 | "What's the story behind that?", 46 | "What's your thought process?", 47 | "What's the deal with that?", 48 | "What's the logic behind it?", 49 | 'Why do you believe that?', 50 | "What's the real deal here?", 51 | "What's the reason behind it?", 52 | "What's the thought process behind your decision?", 53 | "What's the rationale for your opinion?", 54 | 'Why do you have that impression?', 55 | "What's the background to that?", 56 | "What's the evidence that supports your view?", 57 | "What's the explanation for that?" 58 | ] 59 | 60 | Ref_WAY = [ 61 | 'There are in the image,', 62 | 'There are some regions ,', 63 | 'Given ,', 64 | 'Given in the image,', 65 | ',', 66 | 'Several regions are in the image,', 67 | ' in the given image,' 68 | ] 69 | 70 | def _spaced_points(low, high,n): 71 | """ We want n points between low and high, but we don't want them to touch either side""" 72 | padding = (high-low)/(n*2) 73 | return np.linspace(low + padding, high-padding, num=n) 74 | 75 | def make_mask(height, width, box, polygons_list): 76 | """ 77 | Mask size: int about how big mask will be 78 | box: [x1, y1, x2, y2, conf.] 
79 | polygons_list: List of polygons that go inside the box 80 | """ 81 | mask = np.zeros((height, width), dtype=np.bool_) 82 | 83 | xy = np.meshgrid(_spaced_points(box[0], box[2], n=width), 84 | _spaced_points(box[1], box[3], n=height)) 85 | xy_flat = np.stack(xy, 2).reshape((-1, 2)) 86 | 87 | for polygon in polygons_list: 88 | polygon_path = path.Path(polygon) 89 | mask |= polygon_path.contains_points(xy_flat).reshape((height, width)) 90 | return mask.astype(np.float32) 91 | 92 | class VCRDataset(Dataset): 93 | CLASSES = ('object',) 94 | 95 | def __init__(self, 96 | tokenizer, 97 | data_args=None, 98 | ann_file=None, 99 | img_prefix=None, 100 | 101 | ): 102 | super(VCRDataset, self).__init__() 103 | 104 | 105 | self.img_prefix = img_prefix 106 | 107 | self.tokenizer = tokenizer 108 | 109 | self.data_args = data_args 110 | 111 | self.begin_str = """.\nThis provides an overview of the picture.\n""" 112 | self.data_infos = self.load_annotations(ann_file) 113 | print('normal_vcr', len(self.data_infos)) 114 | 115 | def load_annotations(self, ann_file): 116 | 117 | with open(ann_file, 'r') as f: 118 | ann_list = [json.loads(line) for line in f] 119 | data_infos = [] 120 | 121 | import re 122 | 123 | def replace_numbers_with_tags(s, class_names): 124 | pattern = r'\b(\d+)\b' 125 | try: 126 | result = re.sub(pattern, lambda match: f'{class_names[int(match.group(1))]} at region{match.group(1)}', s) 127 | except: 128 | # contain number not for instance 129 | return None 130 | return result 131 | 132 | 133 | for ann in ann_list: 134 | 135 | metadata_fn_path = ann['metadata_fn'] 136 | img_fn = ann['img_fn'] 137 | img_path = os.path.join(self.img_prefix,img_fn) 138 | annotations = json.load(open(os.path.join(self.img_prefix, metadata_fn_path))) 139 | masks = annotations['segms'] 140 | bboxes = np.array(annotations['boxes']) 141 | 142 | class_names = ann['objects'] 143 | num_objects = len(class_names) 144 | ref_string = '' 145 | for i in range(num_objects): 146 | ref_string = ref_string + f'region{i+1} ' + ',' 147 | ref_string = ref_string[:-1] 148 | ref_prefix = random.choice(Ref_WAY) 149 | 150 | begion_string = ref_prefix.replace('', ref_string) 151 | qa_s = [] 152 | 153 | q = ann['question_orig'] 154 | q = replace_numbers_with_tags(q, class_names) 155 | a = ann['answer_orig'] 156 | a = replace_numbers_with_tags(a, class_names) 157 | why = ann['rationale_orig'] 158 | why = replace_numbers_with_tags(why, class_names) 159 | if (q is None) or (a is None) or (why) is None: 160 | continue 161 | 162 | 163 | qa_s.append({'from': 'human', 'value': begion_string + q}) 164 | qa_s.append({'from': 'gpt', 'value': a}) 165 | qa_s.append({'from': 'human', 'value': random.choice(WHY_QUESTIONS)}) 166 | qa_s.append({'from': 'gpt', 'value': why}) 167 | 168 | data_infos.append(dict( 169 | img_path = img_path, 170 | bboxes = bboxes, 171 | masks = masks, 172 | labels= class_names, 173 | qas = qa_s) 174 | ) 175 | 176 | 177 | return data_infos 178 | 179 | def __len__(self): 180 | return len(self.data_infos) 181 | 182 | def __getitem__(self, i): 183 | data_info = self.data_infos[i] 184 | 185 | img_path = data_info['img_path'] 186 | masks = data_info['masks'] 187 | bboxes = data_info['bboxes'] 188 | 189 | qas = data_info['qas'] 190 | processor = self.data_args.image_processor 191 | image = Image.open(img_path).convert('RGB') 192 | w, h = image.size 193 | # TODO ablation this 194 | 195 | image_file = img_path 196 | 197 | pred_masks = np.zeros((len(masks), h, w)) 198 | for i,mask in enumerate(masks): 199 | 200 | int_box = 
[round(box) for box in bboxes[i][:-1]] 201 | 202 | height_ = int(int_box[3]-int_box[1]) 203 | width_ = int(int_box[2]-int_box[0]) 204 | box_mask = make_mask(height_, width_, bboxes[i], mask) 205 | 206 | pred_masks[i, int_box[1]:int_box[3], int_box[0]:int_box[2]] = box_mask 207 | 208 | image = processor.preprocess(image, 209 | do_center_crop=False, 210 | return_tensors='pt')['pixel_values'][0] 211 | 212 | image = torch.nn.functional.interpolate(image.unsqueeze(0), 213 | size=(512, 512), 214 | mode='bilinear', 215 | align_corners=False).squeeze(0) 216 | 217 | cur_token_len = (image.shape[1] // 16) * (image.shape[2] // 16) # FIXME: 16 is hardcoded patch size 218 | qas = copy.deepcopy(qas) 219 | qas[0]['value'] = self.begin_str + qas[0]['value'] 220 | 221 | sources = preprocess_multimodal( 222 | copy.deepcopy([qas]), 223 | self.data_args, cur_token_len) 224 | 225 | data_dict = preprocess( 226 | sources, 227 | self.tokenizer, 228 | has_image=True) 229 | if isinstance(i, int): 230 | data_dict = dict(input_ids=data_dict['input_ids'][0], 231 | labels=data_dict['labels'][0]) 232 | 233 | data_dict['image'] = image 234 | data_dict['masks'] = torch.Tensor(pred_masks) 235 | 236 | return data_dict 237 | -------------------------------------------------------------------------------- /osprey/datasets/vg.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import random 3 | import os 4 | import numpy as np 5 | import torch 6 | from .stage2_data import CustomDataset 7 | from osprey.train.train import preprocess, preprocess_multimodal 8 | 9 | LIMIT = " Answer the question using a short phrase." 10 | QUESTIONS = [ 11 | 'Give me a short description of .', 12 | 'Can you give me a short description of ?', 13 | 'Can you provide me with a short description of the region in the picture marked by ?', 14 | "I'm curious about the region represented by in the picture. Could you describe it in few words?", 15 | 'What can you tell me about the region indicated by in the image in few words?', 16 | "I'd like to know more about the area in the photo labeled . Can you give me a concise description?", 17 | 'Could you describe the region shown as in the picture concisely?', 18 | 'What can you give me about the region outlined by in the photo?', 19 | 'Please provide me with a brief description of the region marked with in the image.', 20 | 'Can you give me a brief introduction of the region labeled as in the picture?', 21 | "I'm interested in knowing the region represented by in the photo. Can you describe it in several words?", 22 | 'What is the region outlined by in the picture like? Could you give me a streamlined description?', 23 | 'Can you provide me with a brief description of the region in the picture marked by , please?', 24 | "I'm curious about the region represented by in the picture. Could you describe it in few words, please?", 25 | 'What can you tell me about the region indicated by in the image?', 26 | "I'd like to know more about the area in the photo labeled , please. Can you give me a simple description?", 27 | 'Could you describe the region shown as in the picture in several words?', 28 | 'Please provide me with a simple description of the region marked with in the image, please.', 29 | "I'm interested in learning more about the region represented by in the photo. Can you describe it in few words, please?", 30 | 'What is the region outlined by in the picture like, please? 
Could you give me a simple and clear description?', 31 | 'Please describe the region in the image concisely.', 32 | 'Can you offer a simple analysis of the region in the image?', 33 | 'Could tell me something about the region highlighted by in the picture briefly?', 34 | 'Can you share a simple rundown of the region denoted by in the presented image?' 35 | ] 36 | 37 | class VGDATA(CustomDataset): 38 | def __init__(self, 39 | tokenizer, 40 | data_args=None, 41 | ann_file=None, 42 | img_prefix=None, 43 | max_gt_per_img=3, 44 | ): 45 | 46 | self.data_args = data_args 47 | self.tokenizer = tokenizer 48 | self.ann_file = ann_file 49 | self.img_prefix = img_prefix 50 | self.max_gt_per_img = max_gt_per_img 51 | 52 | super().__init__(tokenizer, data_args, ann_file, img_prefix, max_gt_per_img) 53 | 54 | self.begin_str = """\nThis provides an overview of the picture.\n""" 55 | 56 | 57 | def get_data_item(self, idx): 58 | data_info = self.data_infos[idx] 59 | ann_info = self.get_ann_info(idx) 60 | 61 | img_path = os.path.join(self.img_prefix, data_info['filename']) 62 | image = self.read_process_image(img_path) 63 | 64 | gt_labels = [] 65 | gt_masks_ann = [] 66 | 67 | for i, ann in enumerate(ann_info): 68 | if ann.get('ignore', False): 69 | continue 70 | mask = self.annToMask(ann['segmentation'], data_info['height'], data_info['width']) 71 | 72 | gt_labels.append(ann['caption']) 73 | gt_masks_ann.append(mask) 74 | 75 | 76 | data_item = dict( 77 | img = image, 78 | gt_labels=gt_labels, 79 | gt_masks=gt_masks_ann 80 | ) 81 | return data_item 82 | 83 | 84 | def process_text(self, data_item): 85 | image = data_item['img'] 86 | ori_labels = data_item['gt_labels'] 87 | ori_masks = np.array(data_item['gt_masks']) 88 | ori_masks = torch.from_numpy(ori_masks) 89 | 90 | shuffle_ids = torch.randperm(len(ori_labels)) 91 | if len(shuffle_ids) > self.max_gt_per_img: 92 | shuffle_ids = shuffle_ids[:self.max_gt_per_img] 93 | ori_masks = ori_masks[shuffle_ids] 94 | ori_labels = [ori_labels[i] for i in shuffle_ids] 95 | 96 | sources = dict() 97 | 98 | sources['conversations'] = [] 99 | 100 | for i in range(len(ori_labels)): 101 | question = random.choice(QUESTIONS).strip() 102 | question = question.replace('', '') 103 | if i == 0: 104 | question = self.begin_str + question 105 | question += LIMIT 106 | answer = ori_labels[i] 107 | sources['conversations'].append( 108 | {'from': 'human', 'value': question}) 109 | sources['conversations'].append({'from': 'gpt', 'value': answer}) 110 | 111 | cur_token_len = (image.shape[1] // 16) * (image.shape[2] // 16) 112 | 113 | sources = preprocess_multimodal( 114 | copy.deepcopy([sources['conversations']]), 115 | self.data_args, 116 | cur_token_len) 117 | # print(sources) 118 | 119 | data_dict = preprocess( 120 | sources, 121 | self.tokenizer, 122 | has_image=True 123 | ) 124 | 125 | # get single 126 | if isinstance(i, int): 127 | data_dict = dict(input_ids=data_dict['input_ids'][0], 128 | labels=data_dict['labels'][0]) 129 | 130 | data_dict['image'] = image 131 | data_dict['masks'] = ori_masks 132 | return data_dict 133 | -------------------------------------------------------------------------------- /osprey/eval/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation for Osprey 🔎 2 | 3 | This document provides instructions on evaluating Osprey on four representative tasks, including open-vocabulary segmentation, referring object classification, detailed region description and region level captioning. 
4 | 5 | We have developed two types of models: the first is [Osprey](https://huggingface.co/sunshine-lwt/Osprey-7b/tree/main), the second is [Osprey-Chat](https://huggingface.co/sunshine-lwt/Osprey-Chat-7b/tree/main) (denoted as `Osprey*` in our paper). Osprey-Chat exhibits better conversation and image-level understanding & reasoning capabilities, as it is trained with additional LLaVA data (llava_v1_5_mix665k.json). 6 | 7 | ## 1. Open-Vocabulary Segmentation 8 | - Download the [SentenceBERT model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2), which is used for calculating the semantic similarity. 9 | - The evaluation is based on `detectron2`; please install the following dependencies. 10 | ``` 11 | git clone https://github.com/facebookresearch/detectron2.git 12 | python -m pip install -e detectron2 13 | pip install git+https://github.com/cocodataset/panopticapi.git 14 | pip install git+https://github.com/mcordts/cityscapesScripts.git 15 | ``` 16 | - To prepare the datasets, please refer to [Data preparation](./datasets/README.md). 17 | 18 | ### Cityscapes 19 | ``` 20 | cd osprey/eval 21 | python eval_open_vocab_seg_detectron2.py --dataset cityscapes --model path/to/osprey-7b --bert path/to/all-MiniLM-L6-v2 22 | ``` 23 | ### ADE20K 24 | ``` 25 | cd osprey/eval 26 | python eval_open_vocab_seg_detectron2.py --dataset ade --model path/to/osprey-7b --bert path/to/all-MiniLM-L6-v2 27 | ``` 28 | 29 | 30 |
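For reference, the open-vocabulary scripts compare Osprey's free-form predictions with the dataset vocabulary through SentenceBERT embeddings rather than exact string matching. The snippet below is a minimal sketch of that matching step, assuming the `sentence-transformers` package and the all-MiniLM-L6-v2 checkpoint downloaded above; `class_names` and `closest_class` are illustrative names, not symbols from `eval_open_vocab_seg_detectron2.py`.

```python
# Minimal sketch: map a free-form predicted label to the closest vocabulary entry
# using SentenceBERT (all-MiniLM-L6-v2) cosine similarity.
from sentence_transformers import SentenceTransformer, util

bert = SentenceTransformer('path/to/all-MiniLM-L6-v2')

def closest_class(prediction, class_names):
    pred_emb = bert.encode(prediction, convert_to_tensor=True)
    class_embs = bert.encode(class_names, convert_to_tensor=True)
    scores = util.cos_sim(pred_emb, class_embs)[0]   # one score per class name
    best = int(scores.argmax())
    return class_names[best], float(scores[best])

# e.g. closest_class('a red double-decker bus', ['bus', 'truck', 'car']) -> ('bus', ...)
```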
31 | 32 |
33 | 34 | 35 | ## 2. Referring Object Classification 36 | 37 | ### LVIS 38 | - Download our generated [lvis_val_1k_category.json](https://huggingface.co/datasets/sunshine-lwt/Osprey-ValData/resolve/main/lvis_val_1k_category.json?download=true) (We randomly sample 1K images with 4,004 objects from LVIS dataset.) 39 | ``` 40 | cd osprey/eval 41 | python lvis_paco_eval.py --model path/to/osprey-7b --bert path/to/all-MiniLM-L6-v2 --img path/to/coco-all-imgs --json lvis_val_1k_category.json 42 | ``` 43 | ### PACO 44 | - Download our generated [paco_val_1k_category.json](https://huggingface.co/datasets/sunshine-lwt/Osprey-ValData/resolve/main/paco_val_1k_category.json?download=true) (We randomly sample 1K images with 4,263 objects from PACO dataset.) 45 | ``` 46 | cd osprey/eval 47 | python lvis_paco_eval.py --model path/to/osprey-7b --bert path/to/all-MiniLM-L6-v2 --img path/to/coco-all-imgs --json paco_val_1k_category.json 48 | ``` 49 | 50 |
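As a reference for how these numbers are computed, `lvis_paco_eval.py` reports two scores per object: the SentenceBERT cosine similarity between the predicted and ground-truth category names, and a word-level "semantic IoU". The sketch below mirrors that logic in a self-contained form (function names are illustrative).

```python
from sentence_transformers import SentenceTransformer, util

bert = SentenceTransformer('path/to/all-MiniLM-L6-v2')

def semantic_similarity(pred, gt):
    # Cosine similarity between SentenceBERT embeddings of the two category strings.
    emb_pred = bert.encode(pred, convert_to_tensor=True)
    emb_gt = bert.encode(gt, convert_to_tensor=True)
    return float(util.cos_sim(emb_pred, emb_gt)[0][0])

def semantic_iou(pred, gt):
    # Word-set intersection over union, as in SemanticIOU() of lvis_paco_eval.py.
    pred_words, gt_words = set(pred.lower().split()), set(gt.lower().split())
    return len(pred_words & gt_words) / len(pred_words | gt_words)
```

Both scores are averaged over all evaluated objects.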
51 | 52 |
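For reference, each entry of the downloaded json provides the image `file_name`, `height` and `width` together with parallel `categories` and `annotations` lists, where every annotation carries a COCO-style `segmentation` (polygon or RLE). A small sketch of how `lvis_paco_eval.py` walks this structure:

```python
import json

entries = json.load(open('paco_val_1k_category.json'))
for entry in entries:
    file_name, h, w = entry['file_name'], entry['height'], entry['width']
    # categories[i] is the ground-truth name for the region annotations[i]
    for name, ann in zip(entry['categories'], entry['annotations']):
        segmentation = ann['segmentation']     # polygon list or COCO RLE dict
        print(file_name, name, type(segmentation))
```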
53 | 54 | ## 3. Detailed Region Description 55 | - Fill in the GPT interface in `eval_gpt.py`. 56 | - Change the model and COCO image paths in `gpt_eval.sh`. 57 | ``` 58 | cd osprey/eval 59 | sh gpt_eval.sh 60 | ``` 61 | 62 |
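`eval_gpt.py` deliberately leaves the GPT call as a placeholder inside `get_eval`. One possible way to fill it in, assuming the legacy `openai<1.0` SDK that the script's `openai.error.RateLimitError` handling implies; the model name, key handling and helper name are examples, not part of the repo:

```python
import openai

openai.api_key = 'YOUR_API_KEY'   # or set it via the OPENAI_API_KEY environment variable

def call_gpt(messages, max_tokens):
    # Possible drop-in body for the placeholder in get_eval(): returns the reviewer's reply text.
    response = openai.ChatCompletion.create(
        model='gpt-4',            # example model name
        messages=messages,
        temperature=0.2,
        max_tokens=max_tokens,
    )
    return response['choices'][0]['message']['content']
```

`get_eval` would then set `ret = call_gpt(messages, max_tokens)` before breaking out of its retry loop.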
63 | 64 |
65 | 66 | ## 4. Ferret-Bench 67 | 68 | Note that we have converted the boxes in `box_refer_caption.json` and `box_refer_reason.json` to polygon format denoted by `segmentation`. 69 | ### Referring Description 70 | 71 | ``` 72 | cd osprey/eval 73 | python ferret_bench_eval.py --model_name path/to/osprey-chat-7b --root_path path/to/coco_imgs --json_path ./ferret_bench/box_refer_caption.json 74 | ``` 75 | 76 | ### Referring Reasoning 77 | 78 | ``` 79 | cd osprey/eval 80 | python ferret_bench_eval.py --model_name path/to/osprey-chat-7b --root_path path/to/coco_imgs --json_path ./ferret_bench/box_refer_reason.json 81 | ``` 82 | 83 | Then use GPT-4 to evaluate the result as in [Ferret](https://github.com/apple/ml-ferret). 84 | 85 |
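For clarity on the data format: each Ferret-Bench entry keeps the referred region both as a COCO-style `bbox` (`[x, y, w, h]`) and as a `segmentation` polygon, which `ferret_bench_eval.py` decodes into a binary mask with pycocotools. The sketch below shows that decoding path and, for illustration only, how an axis-aligned box can be written as a 4-corner polygon (the exact conversion we used is not included in this repo):

```python
from pycocotools import mask as maskUtils

def box_to_polygon(x, y, w, h):
    # Axis-aligned box as a single 4-corner polygon: [x1,y1, x2,y1, x2,y2, x1,y2].
    return [[x, y, x + w, y, x + w, y + h, x, y + h]]

def polygon_to_mask(polygon, height, width):
    # Same decoding path as annToMask() in ferret_bench_eval.py.
    rles = maskUtils.frPyObjects(polygon, height, width)
    rle = maskUtils.merge(rles)
    return maskUtils.decode(rle)   # uint8 array of shape (height, width)
```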
86 | 87 |
88 | 89 | ## 5. POPE 90 | 91 | - Download the coco annotation files from POPE and put them under `osprey/eval/pope`. 92 | - Change the paths in `pope_eval.sh`. 93 | 94 | ``` 95 | cd osprey/eval 96 | sh pope_eval.sh 97 | ``` 98 | 99 |
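`pope/evaluate.py` binarizes each free-form answer (keeping only the first sentence and treating any "no"/"not" as a negative) and then reports accuracy, precision, recall, F1 and the yes-ratio. A condensed sketch of that scoring logic:

```python
def binarize(answer):
    # Keep only the first sentence, then treat any 'no'/'No'/'not' as a negative answer.
    first = answer.split('.')[0].replace(',', '')
    words = first.split(' ')
    return 0 if ('no' in words or 'No' in words or 'not' in words) else 1

def pope_scores(preds, labels):
    TP = sum(p == 1 and l == 1 for p, l in zip(preds, labels))
    FP = sum(p == 1 and l == 0 for p, l in zip(preds, labels))
    TN = sum(p == 0 and l == 0 for p, l in zip(preds, labels))
    FN = sum(p == 0 and l == 1 for p, l in zip(preds, labels))
    precision, recall = TP / (TP + FP), TP / (TP + FN)
    return {
        'accuracy': (TP + TN) / (TP + TN + FP + FN),
        'precision': precision,
        'recall': recall,
        'f1': 2 * precision * recall / (precision + recall),
        'yes_ratio': sum(preds) / len(preds),
    }
```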
100 | 101 |
102 | 103 | ## 6. Region Level Captioning 104 | 105 | - We fine-tune Osprey-7B on the training set of RefCOCOg. The fine-tuned model can be found at [Osprey-7B-refcocog-finetune](https://huggingface.co/sunshine-lwt/Osprey-7b-Refercocog-finetuning/tree/main). 106 | - Download [finetune_refcocog_val_with_mask.json](https://huggingface.co/datasets/sunshine-lwt/Osprey-ValData/resolve/main/finetune_refcocog_val_with_mask.json?download=true). 107 | - Generate the output json file: 108 | 109 | ``` 110 | cd osprey/eval 111 | python refcocog_eval.py --model path/to/Osprey-7B-refcocog-fintune --img path/to/coco-all-imgs --json finetune_refcocog_val_with_mask.json 112 | ``` 113 | - Finally, evaluate the output json file using `CaptionMetrics`. 114 | 115 |
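The final scoring step is not part of this repository. If `CaptionMetrics` is not available, the `pycocoevalcap` package computes the usual captioning metrics; the sketch below assumes each entry of the output json pairs a predicted caption with its ground-truth caption under hypothetical field names (`prediction`, `gt_caption`), so adapt them to the actual file:

```python
import json
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider

def score_captions(result_file):
    data = json.load(open(result_file))
    gts = {i: [d['gt_caption']] for i, d in enumerate(data)}    # reference captions
    res = {i: [d['prediction']] for i, d in enumerate(data)}    # model outputs
    bleu, _ = Bleu(4).compute_score(gts, res)                   # bleu[3] is BLEU-4
    cider, _ = Cider().compute_score(gts, res)
    print('BLEU-4:', bleu[3], 'CIDEr:', cider)
```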
116 | 117 |
118 | -------------------------------------------------------------------------------- /osprey/eval/datasets/README.md: -------------------------------------------------------------------------------- 1 | # Data preparation for Open-Vocabulary Segmentation☕️ 2 | Dataset preparation follows [Detectron2](https://github.com/facebookresearch/detectron2/blob/main/datasets/README.md) and [Mask2Former](https://github.com/facebookresearch/Mask2Former/blob/main/datasets/README.md). 3 | 4 | The datasets are assumed to exist in a directory specified by the environment variable `DETECTRON2_DATASETS`. Under this directory, detectron2 will look for datasets in the structure described below, if needed. 5 | ``` 6 | $DETECTRON2_DATASETS/ 7 | ADEChallengeData2016/ 8 | cityscapes/ 9 | ``` 10 | You can set the location for builtin datasets by export `DETECTRON2_DATASETS=/path/to/datasets`. The default is `./datasets` under the eval directory. 11 | 12 | ## ADE20k (A-150) 13 | Dataset structure: 14 | ``` 15 | ADEChallengeData2016/ 16 | images/ 17 | annotations/ 18 | objectInfo150.txt 19 | # download instance annotation 20 | annotations_instance/ 21 | # generated by prepare_ade20k_sem_seg.py 22 | annotations_detectron2/ 23 | # below are generated by prepare_ade20k_pan_seg.py 24 | ade20k_panoptic_{train,val}.json 25 | ade20k_panoptic_{train,val}/ 26 | # below are generated by prepare_ade20k_ins_seg.py 27 | ade20k_instance_{train,val}.json 28 | ``` 29 | The directory annotations_detectron2 is generated by running `python datasets/prepare_ade20k_sem_seg.py`. 30 | 31 | Download the instance annotation from http://sceneparsing.csail.mit.edu/: 32 | ``` 33 | wget http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar 34 | ``` 35 | Then, run `python datasets/prepare_ade20k_pan_seg.py`, to combine semantic and instance annotations for panoptic annotations. 36 | 37 | Finally, run `python datasets/prepare_ade20k_ins_seg.py`, to extract instance annotations in COCO format. 38 | 39 | ## Cityscapes 40 | Data structure: 41 | ``` 42 | cityscapes/ 43 | gtFine/ 44 | train/ 45 | aachen/ 46 | color.png, instanceIds.png, labelIds.png, polygons.json, 47 | labelTrainIds.png 48 | ... 
49 | val/ 50 | test/ 51 | # below are generated Cityscapes panoptic annotation 52 | cityscapes_panoptic_train.json 53 | cityscapes_panoptic_train/ 54 | cityscapes_panoptic_val.json 55 | cityscapes_panoptic_val/ 56 | cityscapes_panoptic_test.json 57 | cityscapes_panoptic_test/ 58 | leftImg8bit/ 59 | train/ 60 | val/ 61 | test/ 62 | ``` 63 | Install cityscapes scripts by: 64 | ``` 65 | pip install git+https://github.com/mcordts/cityscapesScripts.git 66 | ``` 67 | 68 | Note: to create labelTrainIds.png, first prepare the above structure, then run cityscapesescript with: 69 | ``` 70 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py 71 | ``` 72 | 73 | Note: to generate Cityscapes panoptic dataset, run cityscapesescript with: 74 | 75 | ``` 76 | CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesscripts/preparation/createPanopticImgs.py 77 | ``` 78 | -------------------------------------------------------------------------------- /osprey/eval/datasets/ade20k_instance_catid_mapping.txt: -------------------------------------------------------------------------------- 1 | Instacne100 SceneParse150 FullADE20K 2 | 1 8 165 3 | 2 9 3055 4 | 3 11 350 5 | 4 13 1831 6 | 5 15 774 7 | 5 15 783 8 | 6 16 2684 9 | 7 19 687 10 | 8 20 471 11 | 9 21 401 12 | 10 23 1735 13 | 11 24 2473 14 | 12 25 2329 15 | 13 28 1564 16 | 14 31 57 17 | 15 32 2272 18 | 16 33 907 19 | 17 34 724 20 | 18 36 2985 21 | 18 36 533 22 | 19 37 1395 23 | 20 38 155 24 | 21 39 2053 25 | 22 40 689 26 | 23 42 266 27 | 24 43 581 28 | 25 44 2380 29 | 26 45 491 30 | 27 46 627 31 | 28 48 2388 32 | 29 50 943 33 | 30 51 2096 34 | 31 54 2530 35 | 32 56 420 36 | 33 57 1948 37 | 34 58 1869 38 | 35 59 2251 39 | 36 63 239 40 | 37 65 571 41 | 38 66 2793 42 | 39 67 978 43 | 40 68 236 44 | 41 70 181 45 | 42 71 629 46 | 43 72 2598 47 | 44 73 1744 48 | 45 74 1374 49 | 46 75 591 50 | 47 76 2679 51 | 48 77 223 52 | 49 79 47 53 | 50 81 327 54 | 51 82 2821 55 | 52 83 1451 56 | 53 84 2880 57 | 54 86 480 58 | 55 87 77 59 | 56 88 2616 60 | 57 89 246 61 | 57 89 247 62 | 58 90 2733 63 | 59 91 14 64 | 60 93 38 65 | 61 94 1936 66 | 62 96 120 67 | 63 98 1702 68 | 64 99 249 69 | 65 103 2928 70 | 66 104 2337 71 | 67 105 1023 72 | 68 108 2989 73 | 69 109 1930 74 | 70 111 2586 75 | 71 112 131 76 | 72 113 146 77 | 73 116 95 78 | 74 117 1563 79 | 75 119 1708 80 | 76 120 103 81 | 77 121 1002 82 | 78 122 2569 83 | 79 124 2833 84 | 80 125 1551 85 | 81 126 1981 86 | 82 127 29 87 | 83 128 187 88 | 84 130 747 89 | 85 131 2254 90 | 86 133 2262 91 | 87 134 1260 92 | 88 135 2243 93 | 89 136 2932 94 | 90 137 2836 95 | 91 138 2850 96 | 92 139 64 97 | 93 140 894 98 | 94 143 1919 99 | 95 144 1583 100 | 96 145 318 101 | 97 147 2046 102 | 98 148 1098 103 | 99 149 530 104 | 100 150 954 -------------------------------------------------------------------------------- /osprey/eval/datasets/prepare_ade20k_ins_seg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: https://github.com/facebookresearch/Mask2Former/blob/main/datasets/prepare_ade20k_ins_seg.py 3 | """ 4 | 5 | import glob 6 | import json 7 | import os 8 | from collections import Counter 9 | 10 | import numpy as np 11 | import tqdm 12 | from panopticapi.utils import IdGenerator, save_json 13 | from PIL import Image 14 | import pycocotools.mask as mask_util 15 | 16 | 17 | if __name__ == "__main__": 18 | dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets") 19 | 20 | for name, dirname in 
[("train", "training"), ("val", "validation")]: 21 | image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/") 22 | instance_dir = os.path.join( 23 | dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/" 24 | ) 25 | 26 | # img_id = 0 27 | ann_id = 1 28 | 29 | # json 30 | out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_instance_{name}.json") 31 | 32 | # json config 33 | instance_config_file = "datasets/ade20k_instance_imgCatIds.json" 34 | with open(instance_config_file) as f: 35 | category_dict = json.load(f)["categories"] 36 | 37 | # load catid mapping 38 | # it is important to share category id for both instance and panoptic annotations 39 | mapping_file = "datasets/ade20k_instance_catid_mapping.txt" 40 | with open(mapping_file) as f: 41 | map_id = {} 42 | for i, line in enumerate(f.readlines()): 43 | if i == 0: 44 | continue 45 | ins_id, sem_id, _ = line.strip().split() 46 | # shift id by 1 because we want it to start from 0! 47 | # ignore_label becomes 255 48 | map_id[int(ins_id)] = int(sem_id) - 1 49 | 50 | for cat in category_dict: 51 | cat["id"] = map_id[cat["id"]] 52 | 53 | filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg"))) 54 | 55 | ann_dict = {} 56 | images = [] 57 | annotations = [] 58 | 59 | for idx, filename in enumerate(tqdm.tqdm(filenames)): 60 | image = {} 61 | image_id = os.path.basename(filename).split(".")[0] 62 | 63 | image["id"] = image_id 64 | image["file_name"] = os.path.basename(filename) 65 | 66 | original_format = np.array(Image.open(filename)) 67 | image["width"] = original_format.shape[1] 68 | image["height"] = original_format.shape[0] 69 | 70 | images.append(image) 71 | 72 | filename_instance = os.path.join(instance_dir, image_id + ".png") 73 | ins_seg = np.asarray(Image.open(filename_instance)) 74 | assert ins_seg.dtype == np.uint8 75 | 76 | instance_cat_ids = ins_seg[..., 0] 77 | # instance id starts from 1! 
78 | # because 0 is reserved as VOID label 79 | instance_ins_ids = ins_seg[..., 1] 80 | 81 | # process things 82 | for thing_id in np.unique(instance_ins_ids): 83 | if thing_id == 0: 84 | continue 85 | mask = instance_ins_ids == thing_id 86 | instance_cat_id = np.unique(instance_cat_ids[mask]) 87 | assert len(instance_cat_id) == 1 88 | 89 | anno = {} 90 | anno['id'] = ann_id 91 | ann_id += 1 92 | anno['image_id'] = image['id'] 93 | anno["iscrowd"] = int(0) 94 | anno["category_id"] = int(map_id[instance_cat_id[0]]) 95 | 96 | inds = np.nonzero(mask) 97 | ymin, ymax = inds[0].min(), inds[0].max() 98 | xmin, xmax = inds[1].min(), inds[1].max() 99 | anno["bbox"] = [int(xmin), int(ymin), int(xmax - xmin + 1), int(ymax - ymin + 1)] 100 | # if xmax <= xmin or ymax <= ymin: 101 | # continue 102 | rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 103 | rle["counts"] = rle["counts"].decode("utf-8") 104 | anno["segmentation"] = rle 105 | anno["area"] = int(mask_util.area(rle)) 106 | annotations.append(anno) 107 | 108 | # save this 109 | ann_dict['images'] = images 110 | ann_dict['categories'] = category_dict 111 | ann_dict['annotations'] = annotations 112 | 113 | save_json(ann_dict, out_file) -------------------------------------------------------------------------------- /osprey/eval/datasets/prepare_ade20k_sem_seg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: https://github.com/facebookresearch/Mask2Former/blob/main/datasets/prepare_ade20k_ins_seg.py 3 | """ 4 | 5 | import os 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | import tqdm 10 | from PIL import Image 11 | 12 | 13 | def convert(input, output): 14 | img = np.asarray(Image.open(input)) 15 | assert img.dtype == np.uint8 16 | img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1 17 | Image.fromarray(img).save(output) 18 | 19 | 20 | if __name__ == "__main__": 21 | dataset_dir = ( 22 | Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ade" / "ADEChallengeData2016" 23 | ) 24 | for name in ["training", "validation"]: 25 | annotation_dir = dataset_dir / "annotations" / name 26 | output_dir = dataset_dir / "annotations_detectron2" / name 27 | output_dir.mkdir(parents=True, exist_ok=True) 28 | for file in tqdm.tqdm(list(annotation_dir.iterdir())): 29 | output_file = output_dir / file.name 30 | convert(file, output_file) -------------------------------------------------------------------------------- /osprey/eval/eval_gpt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/eval_gpt_review.py 3 | """ 4 | 5 | import argparse 6 | import json 7 | import os 8 | 9 | import openai 10 | import time 11 | from tqdm import tqdm 12 | import requests 13 | 14 | def get_eval(content: str, max_tokens: int): 15 | while True: 16 | try: 17 | messages=[{ 18 | 'role': 'system', 19 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
20 | }, { 21 | 'role': 'user', 22 | 'content': content, 23 | }] 24 | ########## 25 | 26 | # change youre gpt interface here 27 | # ret = gpt_answer 28 | 29 | ########## 30 | break 31 | 32 | except openai.error.RateLimitError: 33 | pass 34 | except Exception as e: 35 | print(e) 36 | time.sleep(1) 37 | 38 | return ret 39 | 40 | 41 | def parse_score(review): 42 | try: 43 | score_pair = review.split('\n')[0] 44 | score_pair = score_pair.replace(',', ' ') 45 | sp = score_pair.split(' ') 46 | if len(sp) == 2: 47 | return [float(sp[0]), float(sp[1])] 48 | else: 49 | print('error', review) 50 | return [-1, -1] 51 | except Exception as e: 52 | print(e) 53 | print('error', review) 54 | return [-1, -1] 55 | 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 59 | parser.add_argument('--question', help='path to question file') 60 | parser.add_argument('--context', help='path to gpt prompt file') 61 | parser.add_argument('--answer-list', nargs='+', default=[], help='gpt answer and model answer json files') 62 | parser.add_argument('--rule', help='gpt rule') 63 | parser.add_argument('--output', help='output json dir') 64 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 65 | args = parser.parse_args() 66 | 67 | f_q = json.load(open(os.path.expanduser(args.question))) 68 | f_ans1 = json.load(open(os.path.expanduser(args.answer_list[0]))) 69 | f_ans2 = json.load(open(os.path.expanduser(args.answer_list[1]))) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | os.makedirs('./result', exist_ok=True) 73 | 74 | if os.path.isfile(os.path.expanduser(args.output)): 75 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 76 | else: 77 | cur_reviews = [] 78 | 79 | review_file = open(f'{args.output}', 'a') 80 | 81 | context_list = json.load(open(os.path.expanduser(args.context))) 82 | 83 | image_to_context = {context['image']: context for context in context_list} 84 | 85 | handles = [] 86 | idx = 0 87 | 88 | for ques, ans1, ans2 in tqdm(zip(f_q, f_ans1, f_ans2)): 89 | 90 | inst = image_to_context[ques['image']] 91 | 92 | category = ques['category'] 93 | if category in rule_dict: 94 | rule = rule_dict[category] 95 | else: 96 | assert False, f"category not found in rule file: {category}." 
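        # Build the GPT-4 review prompt: the per-category rule supplies the grading instruction
        # ('prompt') and the role label, and the two candidate answers are laid out one after the
        # other. The reviewer is expected to put its two numeric scores on the first line of the
        # reply, which parse_score() later extracts.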
97 | 98 | prompt = rule['prompt'] 99 | role = rule['role'] 100 | content = (f'[Context]\{inst["prompt"]}\n\n' 101 | f'[Question]\n{ques["text"]}\n\n' 102 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 103 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 104 | f'[System]\n{prompt}\n\n') 105 | 106 | cur_js = { 107 | 'id': idx+1, 108 | 'question_id': ques['question_id'], 109 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 110 | 'answer2_id': ans2.get('answer_id', ans2['question_id']), 111 | 'category': category 112 | } 113 | if idx >= len(cur_reviews): 114 | review = get_eval(content, args.max_tokens) 115 | print(review) 116 | 117 | scores = parse_score(review) 118 | cur_js['content'] = review 119 | cur_js['tuple'] = scores 120 | cur_js['answer1'] = ans1["text"] 121 | cur_js['answer2'] = ans2["text"] 122 | review_file.write(json.dumps(cur_js) + '\n') 123 | review_file.flush() 124 | else: 125 | print(f'Skipping {idx} as we already have it.') 126 | 127 | idx += 1 128 | print(idx) 129 | 130 | review_file.close() -------------------------------------------------------------------------------- /osprey/eval/ferret_bench_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | from torch import nn 8 | import copy 9 | from functools import partial 10 | from transformers import AutoTokenizer, CLIPImageProcessor 11 | from osprey.constants import IMAGE_TOKEN_INDEX 12 | from osprey.conversation import conv_templates, SeparatorStyle 13 | from osprey.utils import disable_torch_init 14 | from osprey.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria 15 | from osprey.train.train import preprocess, preprocess_multimodal 16 | from osprey.train.train import DataArguments 17 | from osprey.model.language_model.osprey_llama import OspreyLlamaForCausalLM 18 | from pycocotools import mask as maskUtils 19 | import numpy as np 20 | from PIL import Image 21 | import cv2 22 | import re 23 | data_args = DataArguments() 24 | data_args.mm_use_im_start_end = False 25 | data_args.is_multimodal = True 26 | 27 | def annToMask(ann, h, w): 28 | rles = maskUtils.frPyObjects(ann, h, w) 29 | rle = maskUtils.merge(rles) 30 | m = maskUtils.decode(rle) 31 | return m 32 | 33 | class GPT_EVAL(nn.Module): 34 | def __init__(self, model_path, model_base=None): 35 | super().__init__() 36 | disable_torch_init() 37 | model_path = os.path.expanduser(model_path) 38 | 39 | self.tokenizer = AutoTokenizer.from_pretrained( 40 | model_path, 41 | model_max_length=2048, 42 | padding_side="right", 43 | use_fast=True 44 | ) 45 | self.model = OspreyLlamaForCausalLM.from_pretrained( 46 | model_path, 47 | torch_dtype=torch.bfloat16, 48 | ).cuda() 49 | self.tokenizer.pad_token = self.tokenizer.unk_token 50 | 51 | self.image_processor = CLIPImageProcessor(do_resize=True, size={"shortest_edge":512}, resample=3, do_center_crop=True, crop_size={"height": 512, "width": 512}, 52 | do_rescale=True, rescale_factor=0.00392156862745098, do_normalize=True, image_mean=[0.48145466, 0.4578275, 0.40821073], 53 | image_std=[0.26862954, 0.26130258, 0.27577711], do_convert_rgb=True, ) 54 | 55 | spi_tokens = ['', ''] 56 | self.tokenizer.add_tokens(spi_tokens, special_tokens=True) 57 | 58 | for m in self.model.modules(): 59 | m.tokenizer = self.tokenizer 60 | 61 | vision_tower = self.model.get_vision_tower() 62 | if not vision_tower.is_loaded: 63 | vision_tower.load_model() 64 | 
vision_tower.to(dtype=torch.float16, device='cuda') 65 | 66 | 67 | def forward(self, root_path, ann_file): 68 | final = [] 69 | anns = json.load(open(ann_file)) 70 | 71 | for i, ann in enumerate(anns): 72 | print(i) 73 | 74 | model_answer = {} 75 | model_answer["question_id"] = ann["question_id"] 76 | model_answer["image"] = ann["image"] 77 | model_answer["category"] = ann["category"] 78 | img_path = os.path.join(root_path, ann['image']) 79 | img = cv2.imread(img_path) 80 | question = ann['text'] 81 | 82 | question = re.sub(r'', r'', question) 83 | # question += 'Answer the question in detail.' 84 | idx = 1 85 | 86 | mask_r = ann['annotation']['segmentation'] 87 | height, width = img.shape[:2] 88 | 89 | if isinstance(mask_r, list): 90 | mask = annToMask(mask_r, height, width) 91 | else: 92 | mask = maskUtils.decode(mask_r) 93 | mask = torch.from_numpy(mask).unsqueeze(0) 94 | 95 | x1, y1, w, h = ann['annotation']['bbox'] 96 | bbox = np.array([x1, y1, x1 + w, y1 + h]) 97 | bbox = torch.from_numpy(bbox) 98 | 99 | 100 | init_inputs = get_init_inputs(img_path, 101 | self.image_processor, 102 | self.tokenizer, 103 | pred_bboxes=bbox, 104 | mask=mask, 105 | question=question, 106 | round_ids=0, 107 | last_round_source={}, 108 | ) 109 | 110 | masks = init_inputs['masks'].cuda() 111 | image = init_inputs['image'] 112 | conv = conv_templates['osprey_v1'].copy() 113 | qs = init_inputs['sources'][0][0]['value'] 114 | conv.append_message(conv.roles[0], qs) 115 | conv.append_message(conv.roles[1], None) 116 | prompt = conv.get_prompt() 117 | 118 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 119 | 120 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 121 | keywords = [stop_str] 122 | stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids) 123 | 124 | self.model.model.tokenizer = self.tokenizer 125 | 126 | 127 | with torch.inference_mode(): 128 | 129 | self.model.orig_forward = self.model.forward 130 | self.model.forward = partial(self.model.orig_forward, 131 | img_metas=[None], 132 | masks=[masks.half()]) 133 | 134 | output_ids = self.model.generate( 135 | input_ids, 136 | images=image.unsqueeze(0).half().cuda(), 137 | do_sample=True, 138 | # masks=[masks.half()], 139 | temperature=0.2, 140 | max_new_tokens=1024, 141 | use_cache=True, 142 | num_beams=1, 143 | # stopping_criteria=[stopping_criteria] 144 | ) 145 | 146 | self.model.forward = self.model.orig_forward 147 | 148 | input_token_len = input_ids.shape[1] 149 | n_diff_input_output = ( 150 | input_ids != output_ids[:, :input_token_len]).sum().item() 151 | if n_diff_input_output > 0: 152 | print( 153 | f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 154 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], 155 | skip_special_tokens=True)[0] 156 | 157 | outputs = outputs.strip() 158 | if outputs.endswith(stop_str): 159 | outputs = outputs[:-len(stop_str)] 160 | outputs = outputs.strip() 161 | 162 | model_answer['text'] = outputs 163 | 164 | print(outputs) 165 | final.append(model_answer) 166 | 167 | final_ = json.dumps(final) 168 | with open('ferret_bench/osprey_refer_reason_original_3.json','w') as fw: 169 | fw.write(final_) 170 | fw.close() 171 | 172 | 173 | def get_init_inputs(img_path, 174 | processor, 175 | tokenizer, 176 | pred_bboxes, 177 | mask, 178 | question=None, 179 | round_ids=0, 180 | last_round_source=None): 181 | 182 | if round_ids == 0: 183 | 184 | image = 
Image.open(img_path).convert('RGB') 185 | 186 | image = processor.preprocess(image, 187 | do_center_crop=False, 188 | return_tensors='pt')['pixel_values'][0] 189 | 190 | image = torch.nn.functional.interpolate(image.unsqueeze(0), 191 | size=(512, 512), 192 | mode='bilinear', 193 | align_corners=False).squeeze(0) 194 | 195 | else: 196 | image = last_round_source['image'] 197 | 198 | cur_token_len = (image.shape[1] // 16) * (image.shape[2] // 16) 199 | 200 | mask = mask.to(image.device) 201 | 202 | begin_str = """.\nThis provides an overview of the picture.\n""" 203 | 204 | sources = dict() 205 | sources['conversations'] = [] 206 | 207 | sources['conversations'].append({'from': 'human', 'value': begin_str+question}) 208 | 209 | sources = preprocess_multimodal([sources['conversations']], data_args, cur_token_len) 210 | 211 | data_dict = {} 212 | data_dict['sources'] = sources 213 | data_dict['image'] = image 214 | data_dict['bboxes'] = pred_bboxes 215 | data_dict['masks'] = mask 216 | data_dict['img_metas'] = dict(filename=img_path) 217 | 218 | return data_dict 219 | 220 | 221 | if __name__ == "__main__": 222 | model_name = '/Osprey-Chat-7b' 223 | root_path = '/path/to/coco-imgs' 224 | json_path = './ferret_bench/refer_reason/box_refer_reason.json' 225 | ferret_eval = GPT_EVAL(model_name) 226 | ferret_eval(root_path, json_path) 227 | 228 | -------------------------------------------------------------------------------- /osprey/eval/gpt_eval.sh: -------------------------------------------------------------------------------- 1 | NAME='osprey' 2 | TYPE='description' 3 | 4 | python osprey_generate_gpt_description_answer.py\ 5 | --model /path/to/osprey_7b\ 6 | --coco-img /path/to/coco_imgs\ 7 | --json ${TYPE}/questions.json 8 | 9 | python eval_gpt.py\ 10 | --question ${TYPE}/questions.json\ 11 | --context ${TYPE}/prompt.json\ 12 | --answer-list ${TYPE}/answers.json\ 13 | ${TYPE}/${NAME}_answer.json\ 14 | --rule rule.json\ 15 | --output result/gpt_score_${NAME}_${TYPE}.jsonl 16 | 17 | python summarize_gpt_score.py --dir result 18 | 19 | -------------------------------------------------------------------------------- /osprey/eval/lvis_paco_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | from functools import partial 7 | from transformers import AutoTokenizer, CLIPImageProcessor 8 | from osprey.constants import IMAGE_TOKEN_INDEX 9 | from osprey.conversation import conv_templates, SeparatorStyle 10 | from osprey.mm_utils import tokenizer_image_token 11 | from osprey.train.train import preprocess_multimodal 12 | from osprey.train.train import DataArguments 13 | from osprey.model.language_model.osprey_llama import OspreyLlamaForCausalLM 14 | from pycocotools import mask as maskUtils 15 | import numpy as np 16 | from PIL import Image 17 | from sentence_transformers import SentenceTransformer, util 18 | import argparse 19 | 20 | data_args = DataArguments() 21 | data_args.mm_use_im_start_end = False 22 | data_args.is_multimodal = True 23 | 24 | def annToMask(ann, h, w): 25 | rles = maskUtils.frPyObjects(ann, h, w) 26 | rle = maskUtils.merge(rles) 27 | m = maskUtils.decode(rle) 28 | return m 29 | 30 | class LVIS_PACO_EVAL(): 31 | def __init__(self, model_path, bert_model): 32 | model_path = os.path.expanduser(model_path) 33 | 34 | self.tokenizer = AutoTokenizer.from_pretrained( 35 | model_path, 36 | model_max_length=2048, 37 | padding_side="right", 38 | use_fast=True 39 
| ) 40 | self.model = OspreyLlamaForCausalLM.from_pretrained( 41 | model_path, 42 | torch_dtype=torch.bfloat16, 43 | ).cuda() 44 | self.tokenizer.pad_token = self.tokenizer.unk_token 45 | 46 | self.image_processor = CLIPImageProcessor(do_resize=True, size={"shortest_edge":512}, resample=3, do_center_crop=True, crop_size={"height": 512, "width": 512}, 47 | do_rescale=True, rescale_factor=0.00392156862745098, do_normalize=True, image_mean=[0.48145466, 0.4578275, 0.40821073], 48 | image_std=[0.26862954, 0.26130258, 0.27577711], do_convert_rgb=True, ) 49 | 50 | spi_tokens = ['', ''] 51 | self.tokenizer.add_tokens(spi_tokens, special_tokens=True) 52 | 53 | for m in self.model.modules(): 54 | m.tokenizer = self.tokenizer 55 | 56 | vision_tower = self.model.get_vision_tower() 57 | if not vision_tower.is_loaded: 58 | vision_tower.load_model() 59 | vision_tower.to(dtype=torch.float16, device='cuda') 60 | 61 | self.bert_model = SentenceTransformer(bert_model) 62 | 63 | 64 | def eval(self, root_path, ann_file): 65 | data_all = json.load(open(ann_file)) 66 | all_sim = 0 67 | all_num = 0 68 | all_iou = 0 69 | for data in tqdm(data_all): 70 | img_path = os.path.join(root_path, data['file_name']) 71 | height = data['height'] 72 | width = data['width'] 73 | round_ids = 0 74 | last_source = dict() 75 | for i in range(len(data['categories'])): 76 | category = data['categories'][i].replace('_', ' ') 77 | category = category.replace(':', ' ') 78 | 79 | mask_r = data['annotations'][i]['segmentation'] 80 | 81 | if isinstance(mask_r, list): 82 | mask = annToMask(mask_r, height, width) 83 | else: 84 | mask = maskUtils.decode(mask_r) 85 | mask = torch.from_numpy(mask).unsqueeze(0) 86 | 87 | init_inputs = get_init_inputs(img_path, 88 | self.image_processor, 89 | mask=mask, 90 | round_ids=round_ids, 91 | last_round_source=last_source, 92 | ) 93 | 94 | round_ids += 1 95 | last_source = init_inputs 96 | 97 | image = init_inputs['image'] 98 | masks = init_inputs['masks'].cuda() 99 | 100 | conv = conv_templates['osprey_v1'].copy() 101 | qs = init_inputs['sources'][0][0]['value'] 102 | 103 | conv.append_message(conv.roles[0], qs) 104 | conv.append_message(conv.roles[1], None) 105 | prompt = conv.get_prompt() 106 | 107 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 108 | 109 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 110 | 111 | self.model.model.tokenizer = self.tokenizer 112 | 113 | with torch.inference_mode(): 114 | 115 | self.model.orig_forward = self.model.forward 116 | self.model.forward = partial(self.model.orig_forward, 117 | img_metas=[None], 118 | masks=[masks.half()]) 119 | 120 | output_ids = self.model.generate( 121 | input_ids, 122 | images=image.unsqueeze(0).half().cuda(), 123 | do_sample=True, 124 | temperature=0.2, 125 | max_new_tokens=1024, 126 | use_cache=True, 127 | num_beams=1, 128 | ) 129 | 130 | self.model.forward = self.model.orig_forward 131 | 132 | input_token_len = input_ids.shape[1] 133 | n_diff_input_output = ( 134 | input_ids != output_ids[:, :input_token_len]).sum().item() 135 | if n_diff_input_output > 0: 136 | print( 137 | f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 138 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], 139 | skip_special_tokens=True)[0] 140 | 141 | outputs = outputs.strip() 142 | if outputs.endswith(stop_str): 143 | outputs = outputs[:-len(stop_str)] 144 | outputs = outputs.strip() 145 | if ':' in outputs: 146 
| outputs = outputs.split(':')[1] 147 | 148 | outputs = outputs.replace('.', ' ') 149 | outputs = outputs.replace(':', ' ') 150 | outputs = outputs.replace(',', ' ') 151 | 152 | print("[prediction]: ", outputs) 153 | print("[gt category]:", category) 154 | 155 | outputs_embeddings = self.bert_model.encode(outputs, convert_to_tensor=True) 156 | class_sentence_embeddings = self.bert_model.encode(category, convert_to_tensor=True) 157 | cosine_scores = util.cos_sim(outputs_embeddings, class_sentence_embeddings) 158 | 159 | semantic_iou = SemanticIOU(outputs.lower(), category.lower()) 160 | 161 | all_sim += cosine_scores[0][0] 162 | all_iou += semantic_iou 163 | all_num += 1 164 | 165 | print("sim:{}, iou:{}".format(all_sim/all_num, all_iou/all_num)) 166 | 167 | print("final sim:{}, semantic iou:{}".format(all_sim/all_num, all_iou/all_num)) 168 | 169 | 170 | def SemanticIOU(value: list[str], target: list[str]) -> None: 171 | 172 | intersection = len(set(value.split()) & set(target.split())) 173 | union = len(set(value.split()) | set(target.split())) 174 | 175 | return intersection / union 176 | 177 | def get_init_inputs(img_path, 178 | processor, 179 | mask, 180 | round_ids=0, 181 | last_round_source=None): 182 | 183 | if round_ids == 0: 184 | 185 | image = Image.open(img_path).convert('RGB') 186 | 187 | image = processor.preprocess(image, 188 | do_center_crop=False, 189 | return_tensors='pt')['pixel_values'][0] 190 | 191 | image = torch.nn.functional.interpolate(image.unsqueeze(0), 192 | size=(512, 512), 193 | mode='bilinear', 194 | align_corners=False).squeeze(0) 195 | 196 | else: 197 | image = last_round_source['image'] 198 | 199 | cur_token_len = (image.shape[1] // 16) * (image.shape[2] // 16) 200 | 201 | mask = mask.to(image.device) 202 | 203 | begin_str = """\nThis provides an overview of the picture.\n""" 204 | 205 | sources = dict() 206 | sources['conversations'] = [] 207 | 208 | question = 'What is the category of ? Using only one word or phrase.' 
209 | 210 | sources['conversations'].append({'from': 'human', 'value': begin_str+question}) 211 | 212 | sources = preprocess_multimodal([sources['conversations']], data_args, cur_token_len) 213 | 214 | data_dict = {} 215 | data_dict['sources'] = sources 216 | data_dict['image'] = image 217 | data_dict['masks'] = mask 218 | return data_dict 219 | 220 | 221 | if __name__ == "__main__": 222 | parser = argparse.ArgumentParser(description='osprey demo', formatter_class=argparse.RawTextHelpFormatter) 223 | parser.add_argument('--model', help='path to osprey model', default='/path/to/osprey-7b') 224 | parser.add_argument('--bert', help='path to bert model', default='./all-MiniLM-L6-v2') 225 | parser.add_argument('--img', help='path to coco imgs', default='/path/to/all_coco_imgs') 226 | parser.add_argument('--json', help='path to lvis/paco val json file', default='./paco_val_1k_category.json') 227 | args = parser.parse_args() 228 | 229 | lvis_paco_eval = LVIS_PACO_EVAL(args.model, args.bert) 230 | lvis_paco_eval.eval(args.img, args.json) 231 | 232 | -------------------------------------------------------------------------------- /osprey/eval/osprey_generate_gpt_description_answer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from functools import partial 6 | from transformers import AutoTokenizer, CLIPImageProcessor 7 | from osprey.constants import IMAGE_TOKEN_INDEX 8 | from osprey.conversation import conv_templates, SeparatorStyle 9 | from osprey.mm_utils import tokenizer_image_token 10 | from osprey.train.train import preprocess_multimodal 11 | from osprey.train.train import DataArguments 12 | from osprey.model.language_model.osprey_llama import OspreyLlamaForCausalLM 13 | from pycocotools import mask as maskUtils 14 | import numpy as np 15 | from PIL import Image 16 | import cv2 17 | 18 | data_args = DataArguments() 19 | data_args.mm_use_im_start_end = False 20 | data_args.is_multimodal = True 21 | 22 | def annToMask(ann, h, w): 23 | rles = maskUtils.frPyObjects(ann, h, w) 24 | rle = maskUtils.merge(rles) 25 | m = maskUtils.decode(rle) 26 | return m 27 | 28 | class GPT_EVAL(): 29 | def __init__(self, model_path): 30 | model_path = os.path.expanduser(model_path) 31 | 32 | self.tokenizer = AutoTokenizer.from_pretrained( 33 | model_path, 34 | model_max_length=2048, 35 | padding_side="right", 36 | use_fast=True 37 | ) 38 | self.model = OspreyLlamaForCausalLM.from_pretrained( 39 | model_path, 40 | torch_dtype=torch.bfloat16, 41 | ).cuda() 42 | self.tokenizer.pad_token = self.tokenizer.unk_token 43 | 44 | self.image_processor = CLIPImageProcessor(do_resize=True, size={"shortest_edge":512}, resample=3, do_center_crop=True, crop_size={"height": 512, "width": 512}, 45 | do_rescale=True, rescale_factor=0.00392156862745098, do_normalize=True, image_mean=[0.48145466, 0.4578275, 0.40821073], 46 | image_std=[0.26862954, 0.26130258, 0.27577711], do_convert_rgb=True, ) 47 | 48 | spi_tokens = ['', ''] 49 | self.tokenizer.add_tokens(spi_tokens, special_tokens=True) 50 | 51 | for m in self.model.modules(): 52 | m.tokenizer = self.tokenizer 53 | 54 | vision_tower = self.model.get_vision_tower() 55 | if not vision_tower.is_loaded: 56 | vision_tower.load_model() 57 | vision_tower.to(dtype=torch.float16, device='cuda') 58 | 59 | 60 | def eval(self, root_path, ann_file): 61 | final = [] 62 | anns = json.load(open(ann_file)) 63 | 64 | for i, ann in enumerate(anns): 65 | print(i) 66 | 67 | model_answer = {} 68 | 
model_answer["question_id"] = ann["question_id"] 69 | model_answer["image"] = ann["image"] 70 | model_answer["category"] = ann["category"] 71 | img_path = os.path.join(root_path, ann["image"]) 72 | img = cv2.imread(img_path) 73 | question = ann["text"] 74 | 75 | mask_r = ann['annotation']['segmentation'] 76 | height, width = img.shape[:2] 77 | 78 | if isinstance(mask_r, list): 79 | mask = annToMask(mask_r, height, width) 80 | else: 81 | mask = maskUtils.decode(mask_r) 82 | mask = torch.from_numpy(mask).unsqueeze(0) 83 | 84 | x1, y1, w, h = ann['annotation']['bbox'] 85 | bbox = np.array([x1, y1, x1 + w, y1 + h]) 86 | bbox = torch.from_numpy(bbox) 87 | 88 | init_inputs = get_init_inputs(img_path, 89 | self.image_processor, 90 | mask=mask, 91 | question=question, 92 | round_ids=0, 93 | last_round_source={}, 94 | ) 95 | 96 | masks = init_inputs['masks'].cuda() 97 | image = init_inputs['image'] 98 | conv = conv_templates['osprey_v1'].copy() 99 | qs = init_inputs['sources'][0][0]['value'] 100 | 101 | conv.append_message(conv.roles[0], qs) 102 | conv.append_message(conv.roles[1], None) 103 | prompt = conv.get_prompt() 104 | 105 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 106 | 107 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 108 | 109 | self.model.model.tokenizer = self.tokenizer 110 | 111 | 112 | with torch.inference_mode(): 113 | 114 | self.model.orig_forward = self.model.forward 115 | self.model.forward = partial(self.model.orig_forward, 116 | img_metas=[None], 117 | masks=[masks.half()]) 118 | 119 | output_ids = self.model.generate( 120 | input_ids, 121 | images=image.unsqueeze(0).half().cuda(), 122 | do_sample=True, 123 | temperature=0.2, 124 | max_new_tokens=1024, 125 | use_cache=True, 126 | num_beams=1, 127 | ) 128 | 129 | self.model.forward = self.model.orig_forward 130 | 131 | input_token_len = input_ids.shape[1] 132 | n_diff_input_output = ( 133 | input_ids != output_ids[:, :input_token_len]).sum().item() 134 | if n_diff_input_output > 0: 135 | print( 136 | f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 137 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], 138 | skip_special_tokens=True)[0] 139 | 140 | outputs = outputs.strip() 141 | if outputs.endswith(stop_str): 142 | outputs = outputs[:-len(stop_str)] 143 | outputs = outputs.strip() 144 | if ':' in outputs: 145 | outputs = outputs.split(':')[1] 146 | 147 | model_answer['text'] = outputs 148 | 149 | print(outputs) 150 | final.append(model_answer) 151 | 152 | final_ = json.dumps(final) 153 | with open('description/osprey_answer.json','w') as fw: 154 | fw.write(final_) 155 | fw.close() 156 | 157 | 158 | def get_init_inputs(img_path, 159 | processor, 160 | mask, 161 | question=None, 162 | round_ids=0, 163 | last_round_source=None): 164 | 165 | if round_ids == 0: 166 | 167 | image = Image.open(img_path).convert('RGB') 168 | 169 | image = processor.preprocess(image, 170 | do_center_crop=False, 171 | return_tensors='pt')['pixel_values'][0] 172 | 173 | image = torch.nn.functional.interpolate(image.unsqueeze(0), 174 | size=(512, 512), 175 | mode='bilinear', 176 | align_corners=False).squeeze(0) 177 | 178 | else: 179 | image = last_round_source['image'] 180 | 181 | cur_token_len = (image.shape[1] // 16) * (image.shape[2] // 16) 182 | 183 | mask = mask.to(image.device) 184 | 185 | begin_str = """.\nThis provides an overview of the picture.\n""" 186 | 187 | sources = dict() 188 | 
sources['conversations'] = [] 189 | question = question.replace('','') 190 | 191 | sources['conversations'].append({'from': 'human', 'value': begin_str+question}) 192 | 193 | sources = preprocess_multimodal([sources['conversations']], data_args, cur_token_len) 194 | 195 | data_dict = {} 196 | data_dict['sources'] = sources 197 | data_dict['image'] = image 198 | data_dict['masks'] = mask 199 | 200 | return data_dict 201 | 202 | 203 | if __name__ == "__main__": 204 | parser = argparse.ArgumentParser(description='osprey generate gpt answer', formatter_class=argparse.RawTextHelpFormatter) 205 | parser.add_argument('--model', help='path to osprey model', default='/path/to/osprey-7b') 206 | parser.add_argument('--coco-img', help='path to coco imgs', default='/path/to/coco_all_imgs/') 207 | parser.add_argument('--json', help='path to question json file', default='./description/questions.json') 208 | args = parser.parse_args() 209 | 210 | gpt_eval = GPT_EVAL(args.model) 211 | gpt_eval.eval(args.coco_img, args.json) 212 | -------------------------------------------------------------------------------- /osprey/eval/pope/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description='evaluate pope', formatter_class=argparse.RawTextHelpFormatter) 5 | parser.add_argument('--ans-file', default='pope/coco_pope_random_answers.json') 6 | parser.add_argument('--label-file', default='pope/coco_pope_random.json') 7 | 8 | args = parser.parse_args() 9 | 10 | answers = [json.loads(q) for q in open(args.ans_file, 'r')] 11 | label_list = [json.loads(q)['label'] for q in open(args.label_file, 'r')] 12 | 13 | for answer in answers: 14 | text = answer['answer'] 15 | 16 | # Only keep the first sentence 17 | if text.find('.') != -1: 18 | text = text.split('.')[0] 19 | 20 | text = text.replace(',', '') 21 | words = text.split(' ') 22 | if 'No' in words or 'not' in words or 'no' in words: 23 | answer['answer'] = 'no' 24 | else: 25 | answer['answer'] = 'yes' 26 | 27 | for i in range(len(label_list)): 28 | if label_list[i] == 'no': 29 | label_list[i] = 0 30 | else: 31 | label_list[i] = 1 32 | 33 | pred_list = [] 34 | for answer in answers: 35 | if answer['answer'] == 'no': 36 | pred_list.append(0) 37 | else: 38 | pred_list.append(1) 39 | 40 | pos = 1 41 | neg = 0 42 | yes_ratio = pred_list.count(1) / len(pred_list) 43 | 44 | TP, TN, FP, FN = 0, 0, 0, 0 45 | for pred, label in zip(pred_list, label_list): 46 | if pred == pos and label == pos: 47 | TP += 1 48 | elif pred == pos and label == neg: 49 | FP += 1 50 | elif pred == neg and label == neg: 51 | TN += 1 52 | elif pred == neg and label == pos: 53 | FN += 1 54 | 55 | print('TP\tFP\tTN\tFN\t') 56 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 57 | 58 | precision = float(TP) / float(TP + FP) 59 | recall = float(TP) / float(TP + FN) 60 | f1 = 2*precision*recall / (precision + recall) 61 | acc = (TP + TN) / (TP + TN + FP + FN) 62 | print('Accuracy: {}'.format(acc)) 63 | print('Precision: {}'.format(precision)) 64 | print('Recall: {}'.format(recall)) 65 | print('F1 score: {}'.format(f1)) 66 | print('Yes ratio: {}'.format(yes_ratio)) -------------------------------------------------------------------------------- /osprey/eval/pope_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | from functools import partial 7 | from 
transformers import AutoTokenizer, CLIPImageProcessor 8 | from osprey.constants import IMAGE_TOKEN_INDEX 9 | from osprey.conversation import conv_templates, SeparatorStyle 10 | from osprey.mm_utils import tokenizer_image_token 11 | from osprey.train.train import preprocess_multimodal 12 | from osprey.train.train import DataArguments 13 | from osprey.model.language_model.osprey_llama import OspreyLlamaForCausalLM 14 | import numpy as np 15 | from PIL import Image 16 | import argparse 17 | 18 | data_args = DataArguments() 19 | data_args.mm_use_im_start_end = False 20 | data_args.is_multimodal = True 21 | 22 | 23 | class POPE_EVAL(): 24 | def __init__(self, model_path): 25 | model_path = os.path.expanduser(model_path) 26 | 27 | self.tokenizer = AutoTokenizer.from_pretrained( 28 | model_path, 29 | model_max_length=2048, 30 | padding_side="right", 31 | use_fast=True 32 | ) 33 | self.model = OspreyLlamaForCausalLM.from_pretrained( 34 | model_path, 35 | torch_dtype=torch.bfloat16, 36 | ).cuda() 37 | self.tokenizer.pad_token = self.tokenizer.unk_token 38 | 39 | self.image_processor = CLIPImageProcessor(do_resize=True, size={"shortest_edge":512}, resample=3, do_center_crop=True, crop_size={"height": 512, "width": 512}, 40 | do_rescale=True, rescale_factor=0.00392156862745098, do_normalize=True, image_mean=[0.48145466, 0.4578275, 0.40821073], 41 | image_std=[0.26862954, 0.26130258, 0.27577711], do_convert_rgb=True, ) 42 | 43 | spi_tokens = ['', ''] 44 | self.tokenizer.add_tokens(spi_tokens, special_tokens=True) 45 | 46 | for m in self.model.modules(): 47 | m.tokenizer = self.tokenizer 48 | 49 | vision_tower = self.model.get_vision_tower() 50 | if not vision_tower.is_loaded: 51 | vision_tower.load_model() 52 | vision_tower.to(dtype=torch.float16, device='cuda') 53 | 54 | 55 | def eval(self, root_path, ann_file, answer_file): 56 | data_all = [json.loads(l) for l in open(ann_file, 'r')] 57 | ans_file = open(answer_file, 'w') 58 | 59 | for data in tqdm(data_all): 60 | try: 61 | img_path = os.path.join(root_path, data['image']) 62 | image = Image.open(img_path).convert('RGB') 63 | except: 64 | img_path = os.path.join(root_path, data['image'].split('_')[-1]) 65 | image = Image.open(img_path).convert('RGB') 66 | 67 | init_inputs = get_init_inputs(image, 68 | self.image_processor, 69 | data['text'] 70 | ) 71 | 72 | image = init_inputs['image'] 73 | 74 | conv = conv_templates['osprey_v1'].copy() 75 | qs = init_inputs['sources'][0][0]['value'] 76 | 77 | conv.append_message(conv.roles[0], qs) 78 | conv.append_message(conv.roles[1], None) 79 | prompt = conv.get_prompt() 80 | 81 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 82 | 83 | self.model.model.tokenizer = self.tokenizer 84 | 85 | with torch.inference_mode(): 86 | 87 | self.model.orig_forward = self.model.forward 88 | self.model.forward = partial(self.model.orig_forward, 89 | img_metas=[None], 90 | ) 91 | 92 | output_ids = self.model.generate( 93 | input_ids, 94 | images=image.unsqueeze(0).half().cuda(), 95 | do_sample=True, 96 | temperature=0.2, 97 | max_new_tokens=1024, 98 | use_cache=True, 99 | num_beams=1, 100 | ) 101 | 102 | self.model.forward = self.model.orig_forward 103 | 104 | input_token_len = input_ids.shape[1] 105 | n_diff_input_output = ( 106 | input_ids != output_ids[:, :input_token_len]).sum().item() 107 | if n_diff_input_output > 0: 108 | print( 109 | f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 110 | outputs = 
self.tokenizer.batch_decode(output_ids[:, input_token_len:],
111 |                                                skip_special_tokens=True)[0]
112 | 
113 | 
114 |             ans_file.write(json.dumps({"question": data['text'],
115 |                                        "answer": outputs.lower()})+"\n")
116 | 
117 | 
118 | def get_init_inputs(image,
119 |                     processor,
120 |                     input_question):
121 | 
122 |     image = processor.preprocess(image,
123 |                                  do_center_crop=False,
124 |                                  return_tensors='pt')['pixel_values'][0]
125 | 
126 |     image = torch.nn.functional.interpolate(image.unsqueeze(0),
127 |                                             size=(512, 512),
128 |                                             mode='bilinear',
129 |                                             align_corners=False).squeeze(0)
130 | 
131 | 
132 |     cur_token_len = (image.shape[1] // 16) * (image.shape[2] // 16)
133 | 
134 |     sources = dict()
135 |     sources['conversations'] = []
136 | 
137 |     question = '<image>\n'+input_question  # '<image>' is the placeholder later consumed by tokenizer_image_token
138 | 
139 | 
140 |     sources['conversations'].append({'from': 'human', 'value': question})
141 | 
142 |     sources = preprocess_multimodal([sources['conversations']], data_args, cur_token_len)
143 | 
144 |     data_dict = {}
145 |     data_dict['sources'] = sources
146 |     data_dict['image'] = image
147 |     return data_dict
148 | 
149 | 
150 | if __name__ == "__main__":
151 |     parser = argparse.ArgumentParser(description='osprey demo', formatter_class=argparse.RawTextHelpFormatter)
152 |     parser.add_argument('--model', help='path to osprey model', default='osprey-7b')
153 |     parser.add_argument('--img', help='path to coco imgs', default='/path/to/coco-imgs')
154 |     parser.add_argument('--json', help='path to pope val json file', default='pope/coco_pope_random.json') #'pope/coco_pope_adversarial.json', 'pope/coco_pope_popular.json', 'pope/coco_pope_random.json'
155 |     parser.add_argument('--answer', help='path to answer json file', default='./osprey_pope_random_answer.json')
156 | 
157 |     args = parser.parse_args()
158 | 
159 |     pope_eval = POPE_EVAL(args.model)
160 |     pope_eval.eval(args.img, args.json, args.answer)
161 | 
162 | 
-------------------------------------------------------------------------------- /osprey/eval/pope_eval.sh: --------------------------------------------------------------------------------
1 | 
2 | for type in random popular adversarial
3 | do
4 |     python pope_eval.py --model path/to/osprey-chat-7b \
5 |     --img path/to/coco_imgs --json pope/coco_pope_${type}.json \
6 |     --answer pope/coco_pope_${type}_answers.json
7 | done
8 | 
9 | for type in random popular adversarial
10 | do
11 |     echo "Evaluating pope on ${type} data..."
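    # pope/evaluate.py maps the free-form answers to yes/no and reports TP/FP/TN/FN, accuracy, precision, recall, F1 and the yes ratio for this split.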
12 | python pope/evaluate.py --ans-file pope/coco_pope_${type}_answers.json \ 13 | --label-file pope/coco_pope_${type}.json 14 | done -------------------------------------------------------------------------------- /osprey/eval/refcocog_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | from functools import partial 7 | from transformers import AutoTokenizer, CLIPImageProcessor 8 | from osprey.constants import IMAGE_TOKEN_INDEX 9 | from osprey.conversation import conv_templates, SeparatorStyle 10 | from osprey.mm_utils import tokenizer_image_token 11 | from osprey.train.train import preprocess_multimodal 12 | from osprey.train.train import DataArguments 13 | from osprey.model.language_model.osprey_llama import OspreyLlamaForCausalLM 14 | from pycocotools.coco import COCO 15 | from pycocotools import mask as maskUtils 16 | import numpy as np 17 | from PIL import Image 18 | data_args = DataArguments() 19 | data_args.mm_use_im_start_end = False 20 | data_args.is_multimodal = True 21 | 22 | def annToMask(ann, h, w): 23 | rles = maskUtils.frPyObjects(ann, h, w) 24 | rle = maskUtils.merge(rles) 25 | m = maskUtils.decode(rle) 26 | return m 27 | 28 | class REFCOCOG_EVAL(): 29 | def __init__(self, model_path,): 30 | model_path = os.path.expanduser(model_path) 31 | 32 | self.tokenizer = AutoTokenizer.from_pretrained( 33 | model_path, 34 | model_max_length=2048, 35 | padding_side="right", 36 | use_fast=True 37 | ) 38 | self.model = OspreyLlamaForCausalLM.from_pretrained( 39 | model_path, 40 | torch_dtype=torch.bfloat16, 41 | ).cuda() 42 | self.tokenizer.pad_token = self.tokenizer.unk_token 43 | 44 | self.image_processor = CLIPImageProcessor(do_resize=True, size={"shortest_edge":512}, resample=3, do_center_crop=True, crop_size={"height": 512, "width": 512}, 45 | do_rescale=True, rescale_factor=0.00392156862745098, do_normalize=True, image_mean=[0.48145466, 0.4578275, 0.40821073], 46 | image_std=[0.26862954, 0.26130258, 0.27577711], do_convert_rgb=True, ) 47 | 48 | spi_tokens = ['', ''] 49 | self.tokenizer.add_tokens(spi_tokens, special_tokens=True) 50 | 51 | for m in self.model.modules(): 52 | m.tokenizer = self.tokenizer 53 | 54 | vision_tower = self.model.get_vision_tower() 55 | if not vision_tower.is_loaded: 56 | vision_tower.load_model() 57 | vision_tower.to(dtype=torch.float16, device='cuda') 58 | 59 | def forward(self, root_path, ann_file, gt_file='captions_refcocog_gt.json', caption_file='captions_refcocog_osprey.json'): 60 | self.captions_all = [] 61 | self.gt_all = {} 62 | self.gt_all['images'] = [] 63 | self.gt_all['annotations'] = [] 64 | self.root_path = root_path 65 | self.coco = COCO(ann_file) 66 | self.img_ids = self.coco.getImgIds() 67 | for i, img in enumerate(tqdm(self.img_ids)): 68 | data = self.coco.loadImgs([img])[0] 69 | self.forward_single(data) 70 | 71 | final = json.dumps(self.captions_all) 72 | with open(caption_file,'w') as fw: 73 | fw.write(final) 74 | fw.close() 75 | 76 | final = json.dumps(self.gt_all) 77 | with open(gt_file,'w') as fw: 78 | fw.write(final) 79 | fw.close() 80 | 81 | 82 | def forward_single(self, inputs): 83 | 84 | img_path = os.path.join(self.root_path, inputs['file_name'].split('_')[-1]) 85 | height = inputs['height'] 86 | width = inputs['width'] 87 | round_ids = 0 88 | last_source = dict() 89 | annotations_ids = self.coco.getAnnIds([inputs['id']]) 90 | annotations = self.coco.loadAnns(annotations_ids) 91 | for i in 
range(len(annotations)): 92 | caption = {} 93 | gt = {} 94 | ann = annotations[i] 95 | mask_r = ann['segmentation'] 96 | 97 | if isinstance(mask_r, list): 98 | mask = annToMask(mask_r, height, width) 99 | else: 100 | mask = maskUtils.decode(mask_r) 101 | mask = torch.from_numpy(mask).unsqueeze(0) 102 | 103 | init_inputs = get_init_inputs(img_path, 104 | self.image_processor, 105 | self.tokenizer, 106 | mask=mask, 107 | round_ids=round_ids, 108 | last_round_source=last_source, 109 | ) 110 | 111 | round_ids += 1 112 | last_source = init_inputs 113 | 114 | image = init_inputs['image'] 115 | 116 | masks = init_inputs['masks'].cuda() 117 | 118 | conv = conv_templates['osprey_v1'].copy() 119 | qs = init_inputs['sources'][0][0]['value'] 120 | 121 | conv.append_message(conv.roles[0], qs) 122 | conv.append_message(conv.roles[1], None) 123 | prompt = conv.get_prompt() 124 | 125 | input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 126 | 127 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 128 | 129 | self.model.model.tokenizer = self.tokenizer 130 | 131 | 132 | with torch.inference_mode(): 133 | 134 | self.model.orig_forward = self.model.forward 135 | self.model.forward = partial(self.model.orig_forward, 136 | img_metas=[None], 137 | masks=[masks.half()]) 138 | 139 | output_ids = self.model.generate( 140 | input_ids, 141 | images=image.unsqueeze(0).half().cuda(), 142 | do_sample=True, 143 | temperature=0.2, 144 | max_new_tokens=1024, 145 | use_cache=True, 146 | num_beams=1, 147 | ) 148 | 149 | self.model.forward = self.model.orig_forward 150 | 151 | input_token_len = input_ids.shape[1] 152 | n_diff_input_output = ( 153 | input_ids != output_ids[:, :input_token_len]).sum().item() 154 | if n_diff_input_output > 0: 155 | print( 156 | f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 157 | outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], 158 | skip_special_tokens=True)[0] 159 | 160 | outputs = outputs.strip() 161 | if outputs.endswith(stop_str): 162 | outputs = outputs[:-len(stop_str)] 163 | outputs = outputs.strip() 164 | if ':' in outputs: 165 | outputs = outputs.split(':')[1] 166 | 167 | print(outputs) 168 | outputs = outputs.replace('.', '.\n') 169 | caption['image_id'] = str(ann['id']) 170 | caption['caption'] = outputs 171 | gt['id'] = str(ann['id']) 172 | gt['image_id'] = str(ann['id']) 173 | gt['caption'] = inputs['caption'] 174 | 175 | self.captions_all.append(caption) 176 | self.gt_all['annotations'].append(gt) 177 | self.gt_all['images'].append({'id':str(ann['id'])}) 178 | 179 | 180 | 181 | def get_init_inputs(img_path, 182 | processor, 183 | pred_bboxes, 184 | mask, 185 | round_ids=0, 186 | last_round_source=None): 187 | 188 | if round_ids == 0: 189 | 190 | image = Image.open(img_path).convert('RGB') 191 | 192 | image = processor.preprocess(image, 193 | do_center_crop=False, 194 | return_tensors='pt')['pixel_values'][0] 195 | 196 | image = torch.nn.functional.interpolate(image.unsqueeze(0), 197 | size=(512, 512), 198 | mode='bilinear', 199 | align_corners=False).squeeze(0) 200 | 201 | else: 202 | image = last_round_source['image'] 203 | 204 | cur_token_len = (image.shape[1] // 16) * (image.shape[2] // 16) 205 | 206 | mask = mask.to(image.device) 207 | 208 | begin_str = """.\nThis provides an overview of the picture.\n""" 209 | 210 | sources = dict() 211 | sources['conversations'] = [] 212 | question = 'Please give me a short description of .' 
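    # Prepend the image-level context string and wrap the single-turn prompt into Osprey's conversation format below.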
213 | 214 | sources['conversations'].append({'from': 'human', 'value': begin_str+question}) 215 | 216 | sources = preprocess_multimodal([sources['conversations']], data_args, cur_token_len) 217 | 218 | data_dict = {} 219 | data_dict['sources'] = sources 220 | data_dict['image'] = image 221 | data_dict['masks'] = mask 222 | 223 | return data_dict 224 | 225 | 226 | if __name__ == "__main__": 227 | parser = argparse.ArgumentParser(description='osprey demo', formatter_class=argparse.RawTextHelpFormatter) 228 | parser.add_argument('--model', help='path to osprey model', default='path/to/Osprey-7B-refcocog-fintune') 229 | parser.add_argument('--img', help='path to coco imgs', default='path/to/coco_all_imgs/') 230 | parser.add_argument('--json', help='path to refcocog val json file', default='./finetune_refcocog_val_with_mask.json') 231 | args = parser.parse_args() 232 | 233 | refcocog_eval = REFCOCOG_EVAL(args.model) 234 | refcocog_eval.forward(args.img, args.json) 235 | 236 | -------------------------------------------------------------------------------- /osprey/eval/rule.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. Also, several region description are given, each describing a box region of image, with detailed coordinates. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."} 3 | } -------------------------------------------------------------------------------- /osprey/eval/summarize_gpt_score.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/summarize_gpt_review.py 3 | """ 4 | 5 | import json 6 | import os 7 | from collections import defaultdict 8 | 9 | import numpy as np 10 | 11 | import argparse 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 15 | parser.add_argument('-d', '--dir', default=None) 16 | parser.add_argument('-f', '--files', nargs='*', default=None) 17 | parser.add_argument('-i', '--ignore', nargs='*', default=None) 18 | parser.add_argument('-s', '--save', action='store_true') 19 | return parser.parse_args() 20 | 21 | 22 | if __name__ == '__main__': 23 | args = parse_args() 24 | 25 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl')] 26 | 27 | metrics = [] 28 | for review_file in sorted(review_files): 29 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 30 | scores = defaultdict(list) 31 | print(config) 32 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 33 | for review_str in f: 34 | review = json.loads(review_str) 35 | if args.ignore is not None and review['question_id'] in args.ignore: 36 | continue 37 | if 'category' in review: 38 | scores[review['category']].append(review['tuple']) 39 | scores['all'].append(review['tuple']) 40 | else: 41 | if 'tuple' in review: 42 | scores['all'].append(review['tuple']) 43 | else: 44 | scores['all'].append(review['score']) 45 | summ_dict = defaultdict(list) 46 | for k, v in sorted(scores.items()): 47 | stats = np.asarray(v).mean(0).tolist() 48 | stats = [round(x, 3) for x in stats] 49 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 50 | print(k, round(stats[1]/stats[0]*100, 2)) 51 | summ_dict[k] = round(stats[1]/stats[0]*100, 2) 52 | print('=================================') 53 | metrics.append(summ_dict) 54 | 55 | if args.save: 56 | with open(os.path.join(args.dir, 'metric.json'), 'w') as f: 57 | json.dump(metrics, f, indent=2) 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /osprey/eval/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CircleRadon/Osprey/deb0e57d7e771e8d2a8652236b0e348efb9b5c6a/osprey/eval/utils/__init__.py -------------------------------------------------------------------------------- /osprey/eval/utils/ade20k_150_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:invalid_class_id 2 | 1:wall,walls,brick wall,stone wall,interior wall 3 | 2:building,buildings,edifice,edifices 4 | 3:sky,clouds 5 | 4:floor,flooring 6 | 5:tree,trees 7 | 6:ceiling 8 | 7:road,route,street,roads,streets,routes 9 | 8:bed,beds 10 | 9:windowpane,window,windows 11 | 10:grass,grass field 12 | 11:cabinet,cabinets,wall mounted cabine 13 | 12:sidewalk,pavement 14 | 13:person,child,girl,boy,woman,man,people,children,girls,boys,women,men 15 | 14:earth,ground 16 | 15:door,double door,doors 17 | 
16:table,tables,tablecloth 18 | 17:mountain,mount,mountains 19 | 18:plant,flora,plant life,plants,bushes 20 | 19:curtain,drape,drapery,mantle,pall 21 | 20:chair,chairs 22 | 21:car,automobile,cars 23 | 22:water 24 | 23:painting,picture,paintings,pictures,wallart,framed canvas 25 | 24:sofa,couch,sofas,couches 26 | 25:shelf,shelves 27 | 26:house exterior 28 | 27:sea,ocean 29 | 28:mirror,mirrors 30 | 29:rug,carpet,carpeting 31 | 30:field 32 | 31:armchair,armchairs 33 | 32:seat,seats 34 | 33:fence,fencing 35 | 34:desk,desks 36 | 35:rock,stone,rocks,stones 37 | 36:wardrobe,closet,press,wardrobes,closets 38 | 37:lamp,lamps 39 | 38:bathtub,bathing tub,bath,tub 40 | 39:railing,rail 41 | 40:cushion,cushions 42 | 41:pedestal 43 | 42:box,boxes 44 | 43:column,pillar 45 | 44:signboard,sign,signboards,signs 46 | 45:chest of drawers,chest,bureau,dresser 47 | 46:counter 48 | 47:sand 49 | 48:sink 50 | 49:skyscraper,skyscrapers 51 | 50:fireplace,hearth,open fireplace 52 | 51:refrigerator,icebox 53 | 52:grandstand,covered stand 54 | 53:path 55 | 54:stairs,steps 56 | 55:runway 57 | 56:case,display case,showcase,vitrine 58 | 57:pool table,billiard table,snooker table 59 | 58:pillow,pillows 60 | 59:screen door,shower door 61 | 60:stairway,staircase 62 | 61:river 63 | 62:bridge,span 64 | 63:bookcase 65 | 64:window screen,door screen 66 | 65:coffee table,cocktail table 67 | 66:toilet,commode,crapper,potty 68 | 67:flower,flowers 69 | 68:book,books 70 | 69:hill 71 | 70:bench,benches 72 | 71:countertop,counter top,worktop 73 | 72:stove,kitchen stove,kitchen range,kitchen range,cooking stove 74 | 73:palm tree,palm trees 75 | 74:kitchen island 76 | 75:computer,computing machine,computing device,data processor,electronic computer,information processing system 77 | 76:swivel chair 78 | 77:boat 79 | 78:bar 80 | 79:arcade machine,arcade machines 81 | 80:hovel,hut,hutch,shack,shanty 82 | 81:bus,autobus,double-decker,jitney,motorbus,motorcoach,omnibus,passenger vehicle 83 | 82:towel 84 | 83:light bulb,lightbulb,bulb,incandescent lamp,electric light,electric-light bulb 85 | 84:truck,motortruck 86 | 85:tower,towers 87 | 86:chandelier,pendant,pendent 88 | 87:awning,sunshade,sunblind 89 | 88:streetlight,street lamp 90 | 89:booth,cubicle,stall,kiosk 91 | 90:television receiver,television,television set,tv,tv set 92 | 91:airplane,aeroplane,airplanes,aeroplanes 93 | 92:dirt track 94 | 93:apparel,wearing apparel,dress,clothes 95 | 94:pole 96 | 95:land,soil 97 | 96:bannister,banister,balustrade,balusters,handrail 98 | 97:escalator,moving staircase,moving stairway 99 | 98:ottoman,pouf,pouffe,puff,hassock 100 | 99:bottle,bottles,water bottle 101 | 100:buffet,sideboard 102 | 101:poster,posting,placard,notice,bill,card 103 | 102:stage 104 | 103:van 105 | 104:ship 106 | 105:fountain 107 | 106:conveyer belt,conveyor belt,conveyer,conveyor,transporter 108 | 107:canopy 109 | 108:washer,automatic washer,washing machine 110 | 109:plaything,toy,toys 111 | 110:swimming pool,swimming bath 112 | 111:stool,stools 113 | 112:barrel,cask,barrels,casks 114 | 113:basket,handbasket 115 | 114:waterfall,falls 116 | 115:tent,collapsible shelter 117 | 116:bag,bags,gift bag,paper bag 118 | 117:minibike,motorbike 119 | 118:cradle 120 | 119:oven 121 | 120:ball,balls 122 | 121:food,solid food 123 | 122:step,stair 124 | 123:tank,storage tank 125 | 124:trade name,brand name,brand,marque 126 | 125:microwave,microwave oven 127 | 126:plant pots,plant pot,flower pot,flowerpot,planter 128 | 127:animal,animate being,dog,cat,horse,cow,sheep,zebra,girraffe,bird 129 | 
128:bicycle,bike 130 | 129:lake 131 | 130:dishwasher,dish washer,dishwashing machine 132 | 131:projection screen 133 | 132:blanket,cover 134 | 133:sculpture,sculptures 135 | 134:exhaust hood 136 | 135:sconce,sconce lamp,sconce light 137 | 136:vase,vases 138 | 137:traffic light,traffic signal,traffic lights 139 | 138:tray,trays 140 | 139:ashcan,trash can,garbage can,wastebin,ash bin,ash-bin,ashbin,dustbin,trash barrel,trash bin 141 | 140:ceiling fan,floor fan 142 | 141:pier,wharf,wharfage,dock 143 | 142:crt screen 144 | 143:plate,plates 145 | 144:monitor,monitoring device,monitors 146 | 145:bulletin board,notice board 147 | 146:shower 148 | 147:radiator 149 | 148:cup,cups,drinking glass,drinking glasses 150 | 149:clock 151 | 150:flag,flags -------------------------------------------------------------------------------- /osprey/eval/utils/cityscapes_with_prompt_eng.txt: -------------------------------------------------------------------------------- 1 | 0:road,railroad 2 | 1:sidewalk,pavement 3 | 2:building,buildings,edifice,edifices,house,ceiling 4 | 3:wall,walls,brick wall,stone wall,tile wall,wood wall 5 | 4:fence,fences 6 | 5:pole,poles 7 | 6:traffic light,traffic lights 8 | 7:traffic sign,stop sign 9 | 8:vegetation,tree,trees,palm tree,bushes 10 | 9:terrain,river,sand,sea,snow,water,mountain,grass,dirt,rock 11 | 10:sky,clouds 12 | 11:person 13 | 12:rider 14 | 13:car,cars 15 | 14:truck,trucks 16 | 15:bus,buses 17 | 16:train,trains,locomotive,locomotives,freight train 18 | 17:motorcycle,motorcycles 19 | 18:bicycle,bicycles,bike,bikes -------------------------------------------------------------------------------- /osprey/eval/utils/instance_evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/evaluation/instance_evaluation.py 3 | """ 4 | 5 | import contextlib 6 | import copy 7 | import io 8 | import itertools 9 | import json 10 | import logging 11 | import numpy as np 12 | import os 13 | import pickle 14 | from collections import OrderedDict 15 | import pycocotools.mask as mask_util 16 | import torch 17 | from pycocotools.coco import COCO 18 | from pycocotools.cocoeval import COCOeval 19 | from tabulate import tabulate 20 | 21 | import detectron2.utils.comm as comm 22 | from detectron2.config import CfgNode 23 | from detectron2.data import MetadataCatalog 24 | from detectron2.data.datasets.coco import convert_to_coco_json 25 | from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco 26 | from detectron2.evaluation.fast_eval_api import COCOeval_opt 27 | from detectron2.structures import Boxes, BoxMode, pairwise_iou 28 | from detectron2.utils.file_io import PathManager 29 | from detectron2.utils.logger import create_small_table 30 | 31 | 32 | # modified from COCOEvaluator for instance segmetnat 33 | class InstanceSegEvaluator(COCOEvaluator): 34 | """ 35 | Evaluate AR for object proposals, AP for instance detection/segmentation, AP 36 | for keypoint detection outputs using COCO's metrics. 37 | See http://cocodataset.org/#detection-eval and 38 | http://cocodataset.org/#keypoints-eval to understand its metrics. 39 | The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means 40 | the metric cannot be computed (e.g. due to no predictions made). 41 | 42 | In addition to COCO, this evaluator is able to support any bounding box detection, 43 | instance segmentation, or keypoint detection dataset. 
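    Compared with the stock COCOEvaluator, the dataset's contiguous category ids are not
    required to cover exactly [0, num_classes); each predicted category id only needs to map
    back through thing_dataset_id_to_contiguous_id, which makes the evaluator usable in
    open-vocabulary settings.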
44 | """ 45 | 46 | def _eval_predictions(self, predictions, img_ids=None): 47 | """ 48 | Evaluate predictions. Fill self._results with the metrics of the tasks. 49 | """ 50 | self._logger.info("Preparing results for COCO format ...") 51 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 52 | tasks = self._tasks or self._tasks_from_predictions(coco_results) 53 | 54 | # unmap the category ids for COCO 55 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 56 | dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id 57 | # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 58 | # num_classes = len(all_contiguous_ids) 59 | # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 60 | 61 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 62 | for result in coco_results: 63 | category_id = result["category_id"] 64 | # assert category_id < num_classes, ( 65 | # f"A prediction has class={category_id}, " 66 | # f"but the dataset only has {num_classes} classes and " 67 | # f"predicted class id should be in [0, {num_classes - 1}]." 68 | # ) 69 | assert category_id in reverse_id_mapping, ( 70 | f"A prediction has class={category_id}, " 71 | f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." 72 | ) 73 | result["category_id"] = reverse_id_mapping[category_id] 74 | 75 | if self._output_dir: 76 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 77 | self._logger.info("Saving results to {}".format(file_path)) 78 | with PathManager.open(file_path, "w") as f: 79 | f.write(json.dumps(coco_results)) 80 | f.flush() 81 | 82 | if not self._do_evaluation: 83 | self._logger.info("Annotations are not available for evaluation.") 84 | return 85 | 86 | self._logger.info( 87 | "Evaluating predictions with {} COCO API...".format( 88 | "unofficial" if self._use_fast_impl else "official" 89 | ) 90 | ) 91 | for task in sorted(tasks): 92 | assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" 93 | coco_eval = ( 94 | _evaluate_predictions_on_coco( 95 | self._coco_api, 96 | coco_results, 97 | task, 98 | kpt_oks_sigmas=self._kpt_oks_sigmas, 99 | #use_fast_impl=self._use_fast_impl, 100 | img_ids=img_ids, 101 | max_dets_per_image=self._max_dets_per_image, 102 | ) 103 | if len(coco_results) > 0 104 | else None # cocoapi does not handle empty results very well 105 | ) 106 | 107 | res = self._derive_coco_results( 108 | coco_eval, task, class_names=self._metadata.get("thing_classes") 109 | ) 110 | self._results[task] = res 111 | -------------------------------------------------------------------------------- /osprey/eval/utils/register_ade20k_panoptic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_panoptic.py 3 | """ 4 | 5 | import json 6 | import os 7 | 8 | from detectron2.data import DatasetCatalog, MetadataCatalog 9 | from detectron2.utils.file_io import PathManager 10 | from detectron2.data.datasets.coco import load_sem_seg 11 | 12 | 13 | from . 
import openseg_classes 14 | 15 | ADE20K_150_CATEGORIES = openseg_classes.get_ade20k_categories_with_prompt_eng() 16 | 17 | ADE20k_COLORS = [k["color"] for k in ADE20K_150_CATEGORIES] 18 | 19 | MetadataCatalog.get("openvocab_ade20k_sem_seg_train").set( 20 | stuff_colors=ADE20k_COLORS[:], 21 | ) 22 | 23 | MetadataCatalog.get("openvocab_ade20k_sem_seg_val").set( 24 | stuff_colors=ADE20k_COLORS[:], 25 | ) 26 | 27 | 28 | def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): 29 | """ 30 | Args: 31 | image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". 32 | gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". 33 | json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". 34 | Returns: 35 | list[dict]: a list of dicts in Detectron2 standard format. (See 36 | `Using Custom Datasets `_ ) 37 | """ 38 | 39 | def _convert_category_id(segment_info, meta): 40 | if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: 41 | segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ 42 | segment_info["category_id"] 43 | ] 44 | segment_info["isthing"] = True 45 | else: 46 | segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ 47 | segment_info["category_id"] 48 | ] 49 | segment_info["isthing"] = False 50 | return segment_info 51 | 52 | with PathManager.open(json_file) as f: 53 | json_info = json.load(f) 54 | 55 | ret = [] 56 | for ann in json_info["annotations"]: 57 | image_id = ann["image_id"] 58 | # TODO: currently we assume image and label has the same filename but 59 | # different extension, and images have extension ".jpg" for COCO. Need 60 | # to make image extension a user-provided argument if we extend this 61 | # function to support other COCO-like datasets. 62 | image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") 63 | label_file = os.path.join(gt_dir, ann["file_name"]) 64 | sem_label_file = os.path.join(semseg_dir, ann["file_name"]) 65 | segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] 66 | ret.append( 67 | { 68 | "file_name": image_file, 69 | "image_id": image_id, 70 | "pan_seg_file_name": label_file, 71 | "sem_seg_file_name": sem_label_file, 72 | "segments_info": segments_info, 73 | } 74 | ) 75 | assert len(ret), f"No images found in {image_dir}!" 76 | assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] 77 | assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] 78 | assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] 79 | return ret 80 | 81 | 82 | def register_ade20k_panoptic( 83 | name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None 84 | ): 85 | """ 86 | Register a "standard" version of ADE20k panoptic segmentation dataset named `name`. 87 | The dictionaries in this registered dataset follows detectron2's standard format. 88 | Hence it's called "standard". 89 | Args: 90 | name (str): the name that identifies a dataset, 91 | e.g. "ade20k_panoptic_train" 92 | metadata (dict): extra metadata associated with this dataset. 93 | image_root (str): directory which contains all the images 94 | panoptic_root (str): directory which contains panoptic annotation images in COCO format 95 | panoptic_json (str): path to the json panoptic annotation file in COCO format 96 | sem_seg_root (none): not used, to be consistent with 97 | `register_coco_panoptic_separated`. 
98 | instances_json (str): path to the json instance annotation file 99 | """ 100 | panoptic_name = name 101 | DatasetCatalog.register( 102 | panoptic_name, 103 | lambda: load_ade20k_panoptic_json( 104 | panoptic_json, image_root, panoptic_root, semantic_root, metadata 105 | ), 106 | ) 107 | MetadataCatalog.get(panoptic_name).set( 108 | panoptic_root=panoptic_root, 109 | image_root=image_root, 110 | panoptic_json=panoptic_json, 111 | json_file=instances_json, 112 | evaluator_type="ade20k_panoptic_seg", 113 | ignore_label=255, 114 | label_divisor=1000, 115 | **metadata, 116 | ) 117 | 118 | 119 | _PREDEFINED_SPLITS_ADE20K_PANOPTIC = { 120 | "openvocab_ade20k_panoptic_train": ( 121 | "ADEChallengeData2016/images/training", 122 | "ADEChallengeData2016/ade20k_panoptic_train", 123 | "ADEChallengeData2016/ade20k_panoptic_train.json", 124 | "ADEChallengeData2016/annotations_detectron2/training", 125 | "ADEChallengeData2016/ade20k_instance_train.json", 126 | ), 127 | "openvocab_ade20k_panoptic_val": ( 128 | "ADEChallengeData2016/images/validation", 129 | "ADEChallengeData2016/ade20k_panoptic_val", 130 | "ADEChallengeData2016/ade20k_panoptic_val.json", 131 | "ADEChallengeData2016/annotations_detectron2/validation", 132 | "ADEChallengeData2016/ade20k_instance_val.json", 133 | ), 134 | } 135 | 136 | 137 | def get_metadata(): 138 | meta = {} 139 | # The following metadata maps contiguous id from [0, #thing categories + 140 | # #stuff categories) to their names and colors. We have to replica of the 141 | # same name and color under "thing_*" and "stuff_*" because the current 142 | # visualization function in D2 handles thing and class classes differently 143 | # due to some heuristic used in Panoptic FPN. We keep the same naming to 144 | # enable reusing existing visualization functions. 145 | thing_classes = [k["name"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] 146 | thing_colors = [k["color"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] 147 | stuff_classes = [k["name"] for k in ADE20K_150_CATEGORIES] 148 | stuff_colors = [k["color"] for k in ADE20K_150_CATEGORIES] 149 | 150 | meta["thing_classes"] = thing_classes 151 | meta["thing_colors"] = thing_colors 152 | meta["stuff_classes"] = stuff_classes 153 | meta["stuff_colors"] = stuff_colors 154 | 155 | # Convert category id for training: 156 | # category id: like semantic segmentation, it is the class id for each 157 | # pixel. Since there are some classes not used in evaluation, the category 158 | # id is not always contiguous and thus we have two set of category ids: 159 | # - original category id: category id in the original dataset, mainly 160 | # used for evaluation. 161 | # - contiguous category id: [0, #classes), in order to train the linear 162 | # softmax classifier. 
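    # Every category is added to the stuff mapping (things included) so the sem_seg
    # evaluator can reuse the same contiguous ids; only "thing" categories also get an
    # entry in the thing mapping.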
163 | thing_dataset_id_to_contiguous_id = {} 164 | stuff_dataset_id_to_contiguous_id = {} 165 | 166 | for i, cat in enumerate(ADE20K_150_CATEGORIES): 167 | if cat["isthing"]: 168 | thing_dataset_id_to_contiguous_id[cat["id"]] = i 169 | # else: 170 | # stuff_dataset_id_to_contiguous_id[cat["id"]] = i 171 | 172 | # in order to use sem_seg evaluator 173 | stuff_dataset_id_to_contiguous_id[cat["id"]] = i 174 | 175 | meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id 176 | meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id 177 | 178 | return meta 179 | 180 | 181 | def register_all_ade20k_panoptic(root): 182 | metadata = get_metadata() 183 | for ( 184 | prefix, 185 | (image_root, panoptic_root, panoptic_json, semantic_root, instance_json), 186 | ) in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items(): 187 | # The "standard" version of COCO panoptic segmentation dataset, 188 | # e.g. used by Panoptic-DeepLab 189 | register_ade20k_panoptic( 190 | prefix, 191 | metadata, 192 | os.path.join(root, image_root), 193 | os.path.join(root, panoptic_root), 194 | os.path.join(root, semantic_root), 195 | os.path.join(root, panoptic_json), 196 | os.path.join(root, instance_json), 197 | ) 198 | 199 | def register_all_ade20k_semantic(root): 200 | root = os.path.join(root, "ADEChallengeData2016") 201 | for name, dirname in [("train", "training"), ("val", "validation")]: 202 | image_dir = os.path.join(root, "images", dirname) 203 | gt_dir = os.path.join(root, "annotations_detectron2", dirname) 204 | name = f"openvocab_ade20k_sem_seg_{name}" 205 | DatasetCatalog.register( 206 | name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") 207 | ) 208 | MetadataCatalog.get(name).set( 209 | stuff_classes=[x["name"] for x in ADE20K_150_CATEGORIES], 210 | image_root=image_dir, 211 | sem_seg_root=gt_dir, 212 | evaluator_type="sem_seg", 213 | ignore_label=255, 214 | ) 215 | 216 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 217 | register_all_ade20k_panoptic(_root) 218 | register_all_ade20k_semantic(_root) -------------------------------------------------------------------------------- /osprey/eval/utils/register_cityscapes_panoptic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/cityscapes_panoptic.py 3 | """ 4 | 5 | import json 6 | import logging 7 | import os 8 | 9 | from detectron2.data import DatasetCatalog, MetadataCatalog 10 | from detectron2.utils.file_io import PathManager 11 | 12 | from . import openseg_classes 13 | 14 | CITYSCAPES_CATEGORIES = openseg_classes.get_cityscapes_categories_with_prompt_eng() 15 | 16 | """ 17 | This file contains functions to register the Cityscapes panoptic dataset to the DatasetCatalog. 
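Dataset roots are resolved relative to the DETECTRON2_DATASETS environment variable (default: "datasets").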
18 | """ 19 | 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def get_cityscapes_panoptic_files(image_dir, gt_dir, json_info): 25 | files = [] 26 | # scan through the directory 27 | cities = PathManager.ls(image_dir) 28 | logger.info(f"{len(cities)} cities found in '{image_dir}'.") 29 | image_dict = {} 30 | for city in cities: 31 | city_img_dir = os.path.join(image_dir, city) 32 | for basename in PathManager.ls(city_img_dir): 33 | image_file = os.path.join(city_img_dir, basename) 34 | 35 | suffix = "_leftImg8bit.png" 36 | assert basename.endswith(suffix), basename 37 | basename = os.path.basename(basename)[: -len(suffix)] 38 | 39 | image_dict[basename] = image_file 40 | 41 | for ann in json_info["annotations"]: 42 | image_file = image_dict.get(ann["image_id"], None) 43 | assert image_file is not None, "No image {} found for annotation {}".format( 44 | ann["image_id"], ann["file_name"] 45 | ) 46 | label_file = os.path.join(gt_dir, ann["file_name"]) 47 | segments_info = ann["segments_info"] 48 | 49 | files.append((image_file, label_file, segments_info)) 50 | 51 | assert len(files), "No images found in {}".format(image_dir) 52 | assert PathManager.isfile(files[0][0]), files[0][0] 53 | assert PathManager.isfile(files[0][1]), files[0][1] 54 | return files 55 | 56 | 57 | def load_cityscapes_panoptic(image_dir, gt_dir, gt_json, meta): 58 | """ 59 | Args: 60 | image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train". 61 | gt_dir (str): path to the raw annotations. e.g., 62 | "~/cityscapes/gtFine/cityscapes_panoptic_train". 63 | gt_json (str): path to the json file. e.g., 64 | "~/cityscapes/gtFine/cityscapes_panoptic_train.json". 65 | meta (dict): dictionary containing "thing_dataset_id_to_contiguous_id" 66 | and "stuff_dataset_id_to_contiguous_id" to map category ids to 67 | contiguous ids for training. 68 | 69 | Returns: 70 | list[dict]: a list of dicts in Detectron2 standard format. (See 71 | `Using Custom Datasets `_ ) 72 | """ 73 | 74 | def _convert_category_id(segment_info, meta): 75 | if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: 76 | segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ 77 | segment_info["category_id"] 78 | ] 79 | else: 80 | segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ 81 | segment_info["category_id"] 82 | ] 83 | return segment_info 84 | 85 | assert os.path.exists( 86 | gt_json 87 | ), "Please run `python cityscapesscripts/preparation/createPanopticImgs.py` to generate label files." # noqa 88 | with open(gt_json) as f: 89 | json_info = json.load(f) 90 | files = get_cityscapes_panoptic_files(image_dir, gt_dir, json_info) 91 | ret = [] 92 | for image_file, label_file, segments_info in files: 93 | sem_label_file = ( 94 | image_file.replace("leftImg8bit", "gtFine").split(".")[0] + "_labelTrainIds.png" 95 | ) 96 | segments_info = [_convert_category_id(x, meta) for x in segments_info] 97 | ret.append( 98 | { 99 | "file_name": image_file, 100 | "image_id": "_".join( 101 | os.path.splitext(os.path.basename(image_file))[0].split("_")[:3] 102 | ), 103 | "sem_seg_file_name": sem_label_file, 104 | "pan_seg_file_name": label_file, 105 | "segments_info": segments_info, 106 | } 107 | ) 108 | assert len(ret), f"No images found in {image_dir}!" 
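    # The *_labelTrainIds.png and panoptic label PNGs must be generated beforehand with
    # the cityscapesscripts preparation tools, as the assertions below verify.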
109 | assert PathManager.isfile( 110 | ret[0]["sem_seg_file_name"] 111 | ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py" # noqa 112 | assert PathManager.isfile( 113 | ret[0]["pan_seg_file_name"] 114 | ), "Please generate panoptic annotation with python cityscapesscripts/preparation/createPanopticImgs.py" # noqa 115 | return ret 116 | 117 | 118 | # rename to avoid conflict 119 | _RAW_CITYSCAPES_PANOPTIC_SPLITS = { 120 | "openvocab_cityscapes_fine_panoptic_train": ( 121 | "cityscapes/leftImg8bit/train", 122 | "cityscapes/gtFine/cityscapes_panoptic_train", 123 | "cityscapes/gtFine/cityscapes_panoptic_train.json", 124 | ), 125 | "openvocab_cityscapes_fine_panoptic_val": ( 126 | "cityscapes/leftImg8bit/val", 127 | "cityscapes/gtFine/cityscapes_panoptic_val", 128 | "cityscapes/gtFine/cityscapes_panoptic_val.json", 129 | ), 130 | # "cityscapes_fine_panoptic_test": not supported yet 131 | } 132 | 133 | 134 | def register_all_cityscapes_panoptic(root): 135 | meta = {} 136 | # The following metadata maps contiguous id from [0, #thing categories + 137 | # #stuff categories) to their names and colors. We have to replica of the 138 | # same name and color under "thing_*" and "stuff_*" because the current 139 | # visualization function in D2 handles thing and class classes differently 140 | # due to some heuristic used in Panoptic FPN. We keep the same naming to 141 | # enable reusing existing visualization functions. 142 | thing_classes = [k["name"] for k in CITYSCAPES_CATEGORIES] 143 | thing_colors = [k["color"] for k in CITYSCAPES_CATEGORIES] 144 | stuff_classes = [k["name"] for k in CITYSCAPES_CATEGORIES] 145 | stuff_colors = [k["color"] for k in CITYSCAPES_CATEGORIES] 146 | 147 | meta["thing_classes"] = thing_classes 148 | meta["thing_colors"] = thing_colors 149 | meta["stuff_classes"] = stuff_classes 150 | meta["stuff_colors"] = stuff_colors 151 | 152 | # There are three types of ids in cityscapes panoptic segmentation: 153 | # (1) category id: like semantic segmentation, it is the class id for each 154 | # pixel. Since there are some classes not used in evaluation, the category 155 | # id is not always contiguous and thus we have two set of category ids: 156 | # - original category id: category id in the original dataset, mainly 157 | # used for evaluation. 158 | # - contiguous category id: [0, #classes), in order to train the classifier 159 | # (2) instance id: this id is used to differentiate different instances from 160 | # the same category. For "stuff" classes, the instance id is always 0; for 161 | # "thing" classes, the instance id starts from 1 and 0 is reserved for 162 | # ignored instances (e.g. crowd annotation). 163 | # (3) panoptic id: this is the compact id that encode both category and 164 | # instance id by: category_id * 1000 + instance_id. 
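    # e.g. the third "car" instance in an image (Cityscapes label id 26) is encoded as 26 * 1000 + 3 = 26003.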
165 | thing_dataset_id_to_contiguous_id = {} 166 | stuff_dataset_id_to_contiguous_id = {} 167 | 168 | for k in CITYSCAPES_CATEGORIES: 169 | if k["isthing"] == 1: 170 | thing_dataset_id_to_contiguous_id[k["id"]] = k["trainId"] 171 | else: 172 | stuff_dataset_id_to_contiguous_id[k["id"]] = k["trainId"] 173 | 174 | meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id 175 | meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id 176 | 177 | for key, (image_dir, gt_dir, gt_json) in _RAW_CITYSCAPES_PANOPTIC_SPLITS.items(): 178 | image_dir = os.path.join(root, image_dir) 179 | gt_dir = os.path.join(root, gt_dir) 180 | gt_json = os.path.join(root, gt_json) 181 | 182 | DatasetCatalog.register( 183 | key, lambda x=image_dir, y=gt_dir, z=gt_json: load_cityscapes_panoptic(x, y, z, meta) 184 | ) 185 | MetadataCatalog.get(key).set( 186 | panoptic_root=gt_dir, 187 | image_root=image_dir, 188 | panoptic_json=gt_json, 189 | gt_dir=gt_dir.replace("cityscapes_panoptic_", ""), 190 | evaluator_type="cityscapes_panoptic_seg", 191 | ignore_label=255, 192 | label_divisor=1000, 193 | **meta, 194 | ) 195 | 196 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 197 | register_all_cityscapes_panoptic(_root) -------------------------------------------------------------------------------- /osprey/mm_utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from io import BytesIO 3 | import base64 4 | 5 | import torch 6 | from transformers import StoppingCriteria 7 | from osprey.constants import IMAGE_TOKEN_INDEX 8 | 9 | 10 | def load_image_from_base64(image): 11 | return Image.open(BytesIO(base64.b64decode(image))) 12 | 13 | def expand2square(pil_img, background_color): 14 | width, height = pil_img.size 15 | if width == height: 16 | return pil_img 17 | elif width > height: 18 | result = Image.new(pil_img.mode, (width, width), background_color) 19 | result.paste(pil_img, (0, (width - height) // 2)) 20 | return result 21 | else: 22 | result = Image.new(pil_img.mode, (height, height), background_color) 23 | result.paste(pil_img, ((height - width) // 2, 0)) 24 | return result 25 | 26 | 27 | def process_images(images, image_processor, model_cfg): 28 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 29 | new_images = [] 30 | if image_aspect_ratio == 'pad': 31 | for image in images: 32 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 33 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 34 | new_images.append(image) 35 | else: 36 | return image_processor(images, return_tensors='pt')['pixel_values'] 37 | if all(x.shape == new_images[0].shape for x in new_images): 38 | new_images = torch.stack(new_images, dim=0) 39 | return new_images 40 | 41 | 42 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 43 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 44 | 45 | def insert_separator(X, sep): 46 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 47 | 48 | input_ids = [] 49 | offset = 0 50 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 51 | offset = 1 52 | input_ids.append(prompt_chunks[0][0]) 53 | 54 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 55 | input_ids.extend(x[offset:]) 56 | 57 | if return_tensors is not None: 58 | if 
return_tensors == 'pt': 59 | return torch.tensor(input_ids, dtype=torch.long) 60 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 61 | return input_ids 62 | 63 | 64 | def get_model_name_from_path(model_path): 65 | model_path = model_path.strip("/") 66 | model_paths = model_path.split("/") 67 | if model_paths[-1].startswith('checkpoint-'): 68 | return model_paths[-2] + "_" + model_paths[-1] 69 | else: 70 | return model_paths[-1] 71 | 72 | class KeywordsStoppingCriteria(StoppingCriteria): 73 | def __init__(self, keywords, tokenizer, input_ids): 74 | self.keywords = keywords 75 | self.keyword_ids = [] 76 | for keyword in keywords: 77 | cur_keyword_ids = tokenizer(keyword).input_ids 78 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 79 | cur_keyword_ids = cur_keyword_ids[1:] 80 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 81 | self.tokenizer = tokenizer 82 | self.start_len = input_ids.shape[1] 83 | 84 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 85 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO 86 | offset = min(output_ids.shape[1] - self.start_len, 3) 87 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 88 | for keyword_id in self.keyword_ids: 89 | if output_ids[0, -keyword_id.shape[0]:] == keyword_id: 90 | return True 91 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 92 | for keyword in self.keywords: 93 | if keyword in outputs: 94 | return True 95 | return False 96 | -------------------------------------------------------------------------------- /osprey/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.osprey_llama import OspreyLlamaForCausalLM, OspreyConfig 2 | -------------------------------------------------------------------------------- /osprey/model/consolidate.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | 4 | import torch 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | from osprey.model import * 7 | from osprey.model.utils import auto_upgrade 8 | 9 | 10 | def consolidate_ckpt(src_path, dst_path): 11 | print("Loading model") 12 | auto_upgrade(src_path) 13 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 14 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 15 | src_model.save_pretrained(dst_path) 16 | src_tokenizer.save_pretrained(dst_path) 17 | 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--src", type=str, required=True) 22 | parser.add_argument("--dst", type=str, required=True) 23 | 24 | args = parser.parse_args() 25 | 26 | consolidate_ckpt(args.src, args.dst) 27 | -------------------------------------------------------------------------------- /osprey/model/language_model/osprey_llama.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | 14 | from typing import List, Optional, Tuple, Union 15 | 16 | import torch 17 | import torch.nn as nn 18 | from torch.nn import CrossEntropyLoss 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, \ 21 | LlamaConfig, LlamaModel, LlamaForCausalLM 22 | 23 | from transformers.modeling_outputs import CausalLMOutputWithPast 24 | 25 | from ..osprey_arch import OspreyMetaModel, OspreyMetaForCausalLM 26 | 27 | from ..layer import MaskExtractor 28 | 29 | class OspreyConfig(LlamaConfig): 30 | model_type = "osprey" 31 | 32 | 33 | class OspreyLlamaModel(OspreyMetaModel, LlamaModel): 34 | config_class = OspreyConfig 35 | 36 | def __init__(self, config: LlamaConfig): 37 | super(OspreyLlamaModel, self).__init__(config) 38 | 39 | 40 | class OspreyLlamaForCausalLM(LlamaForCausalLM, OspreyMetaForCausalLM): 41 | config_class = OspreyConfig 42 | 43 | def __init__(self, config): 44 | super(LlamaForCausalLM, self).__init__(config) 45 | self.model = OspreyLlamaModel(config) 46 | 47 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 48 | 49 | self.mask_extractor = MaskExtractor() 50 | 51 | # Initialize weights and apply final processing 52 | self.post_init() 53 | 54 | def get_model(self): 55 | return self.model 56 | 57 | def forward( 58 | self, 59 | input_ids: torch.LongTensor = None, 60 | attention_mask: Optional[torch.Tensor] = None, 61 | img_metas = None, 62 | masks = None, 63 | past_key_values: Optional[List[torch.FloatTensor]] = None, 64 | inputs_embeds: Optional[torch.FloatTensor] = None, 65 | labels: Optional[torch.LongTensor] = None, 66 | use_cache: Optional[bool] = None, 67 | output_attentions: Optional[bool] = None, 68 | output_hidden_states: Optional[bool] = None, 69 | images: Optional[torch.FloatTensor] = None, 70 | return_dict: Optional[bool] = None, 71 | ) -> Union[Tuple, CausalLMOutputWithPast]: 72 | output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions 73 | output_hidden_states = ( 74 | output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states 75 | ) 76 | return_dict = return_dict if return_dict is not None else self.config.use_return_dict 77 | 78 | input_token_len = input_ids.shape[1] 79 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, masks, attention_mask, past_key_values, labels, images) 80 | 81 | # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) 82 | 83 | if inputs_embeds is not None: 84 | inputs_embeds = inputs_embeds.bfloat16() 85 | 86 | self.model = self.model.bfloat16() 87 | 88 | outputs = self.model( 89 | input_ids=input_ids, 90 | attention_mask=attention_mask, 91 | past_key_values=past_key_values, 92 | inputs_embeds=inputs_embeds, 93 | use_cache=use_cache, 94 | output_attentions=output_attentions, 95 | output_hidden_states=output_hidden_states, 96 | return_dict=return_dict 97 | ) 98 | 99 | hidden_states = outputs[0] 100 | self.lm_head = self.lm_head.to(hidden_states.dtype) 101 | logits = 
self.lm_head(hidden_states) 102 | 103 | loss = None 104 | if labels is not None: 105 | # Shift so that tokens < n predict n 106 | shift_logits = logits[..., :-1, :].contiguous() 107 | shift_labels = labels[..., 1:].contiguous() 108 | # Flatten the tokens 109 | loss_fct = CrossEntropyLoss() 110 | shift_logits = shift_logits.view(-1, self.config.vocab_size) 111 | shift_labels = shift_labels.view(-1) 112 | # Enable model/pipeline parallelism 113 | shift_labels = shift_labels.to(shift_logits.device) 114 | loss = loss_fct(shift_logits, shift_labels) 115 | 116 | if not return_dict: 117 | output = (logits,) + outputs[1:] 118 | 119 | return (loss,) + output if loss is not None else output 120 | 121 | return CausalLMOutputWithPast( 122 | loss=loss, 123 | logits=logits, 124 | past_key_values=outputs.past_key_values, 125 | hidden_states=outputs.hidden_states, 126 | attentions=outputs.attentions, 127 | ) 128 | 129 | def prepare_inputs_for_generation( 130 | self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs 131 | ): 132 | if past_key_values: 133 | input_ids = input_ids[:, -1:] 134 | 135 | # if `inputs_embeds` are passed, we only want to use them in the 1st generation step 136 | if inputs_embeds is not None and past_key_values is None: 137 | model_inputs = {"inputs_embeds": inputs_embeds} 138 | else: 139 | model_inputs = {"input_ids": input_ids} 140 | 141 | model_inputs.update( 142 | { 143 | "past_key_values": past_key_values, 144 | "use_cache": kwargs.get("use_cache"), 145 | "attention_mask": attention_mask, 146 | "images": kwargs.get("images", None), 147 | } 148 | ) 149 | return model_inputs 150 | 151 | AutoConfig.register("osprey", OspreyConfig) 152 | AutoModelForCausalLM.register(OspreyConfig, OspreyLlamaForCausalLM) 153 | -------------------------------------------------------------------------------- /osprey/model/layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class MLP(nn.Module): 7 | 8 | def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, 9 | num_layers: int) -> None: 10 | super().__init__() 11 | self.num_layers = num_layers 12 | h = [hidden_dim] * (num_layers - 1) 13 | self.layers = nn.ModuleList( 14 | nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 15 | 16 | def forward(self, x): 17 | for i, layer in enumerate(self.layers): 18 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 19 | return x 20 | 21 | 22 | class MaskExtractor(nn.Module): 23 | def __init__(self, mask_shape=112, embed_dim=1024, out_dim=4096): 24 | super(MaskExtractor, self).__init__() 25 | self.mask_shape = mask_shape 26 | self.mask_pooling = MaskPooling() 27 | self.feat_linear = nn.Linear(embed_dim, out_dim) 28 | self.mask_linear = MLP(mask_shape*mask_shape, embed_dim, out_dim, 3) 29 | # self.res_linear = {} 30 | self.feature_name = ['res2', 'res3', 'res4', 'res5'] 31 | 32 | # for i, feat in enumerate(self.feature_name): 33 | # self.res_linear[feat] = nn.Linear(192*(2**i), embed_dim) 34 | 35 | self.res2 = nn.Linear(192, 1024) 36 | self.res3 = nn.Linear(384, 1024) 37 | self.res4 = nn.Linear(768, 1024) 38 | self.res5 = nn.Linear(1536, 1024) 39 | 40 | def forward(self, feats, masks): 41 | query_feats = [] 42 | pos_feats = [] 43 | if masks is None: 44 | return query_feats, pos_feats 45 | 46 | num_imgs = len(masks) 47 | 48 | for idx in range(num_imgs): 49 | mask = masks[idx].unsqueeze(0).float() 50 | 51 | num_feats 
= len(self.feature_name) 52 | mask_feats = mask.new_zeros(num_feats, mask.shape[1], 1024) 53 | for i, name in enumerate(self.feature_name): 54 | feat = feats[name][idx].unsqueeze(0) 55 | 56 | raw_dtype = feat.dtype 57 | feat = feat.to(mask.dtype) 58 | mask_feat_raw = self.mask_pooling(feat, mask) 59 | 60 | mask_feat_flatten = mask_feat_raw.reshape(-1, mask_feat_raw.shape[-1]) 61 | 62 | # self.res_linear[name] = self.res_linear[name].to(dtype=mask_feat_flatten.dtype, device=mask_feat_flatten.device) 63 | # mask_feat = self.res_linear[name](mask_feat_flatten) 64 | 65 | if name=='res2': 66 | self.res2 = self.res2.to(device=mask_feat_flatten.device, dtype=mask_feat_flatten.dtype) 67 | mask_feat = self.res2(mask_feat_flatten) 68 | elif name=='res3': 69 | self.res3 = self.res3.to(device=mask_feat_flatten.device, dtype=mask_feat_flatten.dtype) 70 | mask_feat = self.res3(mask_feat_flatten) 71 | elif name=='res4': 72 | self.res4 = self.res4.to(device=mask_feat_flatten.device, dtype=mask_feat_flatten.dtype) 73 | mask_feat = self.res4(mask_feat_flatten) 74 | else: 75 | self.res5 = self.res5.to(device=mask_feat_flatten.device, dtype=mask_feat_flatten.dtype) 76 | mask_feat = self.res5(mask_feat_flatten) 77 | 78 | mask_feat = mask_feat.reshape(*mask_feat_raw.shape[:2], -1) 79 | mask_feat = mask_feat.to(raw_dtype) 80 | 81 | mask_feats[i] = mask_feat[0] 82 | mask_feats = mask_feats.sum(0) 83 | self.feat_linear = self.feat_linear.to(dtype=mask_feats.dtype, device=mask_feats.device) 84 | mask_feats_linear = self.feat_linear(mask_feats) 85 | query_feats.append(mask_feats_linear) 86 | 87 | # position 88 | mask = F.interpolate(mask, size=self.mask_shape, mode='bilinear', align_corners=False) 89 | self.mask_linear = self.mask_linear.to(dtype=mask.dtype, device=mask.device) 90 | pos_feat = self.mask_linear(mask.reshape(mask.shape[1], -1)) 91 | pos_feats.append(pos_feat) 92 | 93 | return query_feats, pos_feats 94 | 95 | 96 | class MaskPooling(nn.Module): 97 | def __init__(self): 98 | super().__init__() 99 | 100 | def forward(self, x, mask): 101 | 102 | if not x.shape[-2:] == mask.shape[-2:]: 103 | # reshape mask to x 104 | mask = F.interpolate(mask, size=x.shape[-2:], mode='bilinear', align_corners=False) 105 | 106 | b, c, h ,w = x.shape 107 | b, q, h, w = mask.shape 108 | mask = (mask > 0).to(mask.dtype) 109 | denorm = mask.sum(dim=(-1, -2), keepdim=True) + 1e-8 110 | 111 | mask_pooled_x = torch.einsum( 112 | "bchw,bqhw->bqc", 113 | x, 114 | mask / denorm, 115 | ) 116 | return mask_pooled_x 117 | -------------------------------------------------------------------------------- /osprey/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, delay_load=False): 6 | 7 | return CLIPVisionTower(args=vision_tower_cfg) 8 | 9 | raise ValueError(f'Unknown vision tower: {vision_tower}') 10 | -------------------------------------------------------------------------------- /osprey/model/multimodal_encoder/clip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | 5 | from open_clip.model import _build_vision_tower 6 | 7 | 8 | class CLIP(nn.Module): 9 | def __init__(self): 10 | super().__init__() 11 | model_name = 'convnext_large' 12 | 13 | vision_cfg = {'timm_model_name': model_name, 'timm_model_pretrained': False, 'timm_pool': '', 
'timm_proj': 'mlp', 'timm_drop': 0.0, 'timm_drop_path': 0.1, 'image_size': 320} 14 | self.visual = _build_vision_tower(embed_dim=768, vision_cfg=vision_cfg, quick_gelu=False) 15 | 16 | self.eval() 17 | self.freeze_everything() 18 | 19 | def freeze_everything(self): 20 | for param in self.visual.parameters(): 21 | param.requires_grad = False 22 | 23 | def extract_features(self, x): 24 | out = {} 25 | x = x.to(self.visual.trunk.stem.state_dict()['1.bias'].dtype) 26 | x = self.visual.trunk.stem(x) 27 | out['stem'] = x.contiguous() 28 | for i in range(4): 29 | x = self.visual.trunk.stages[i](x) 30 | out[f'res{i+2}'] = x.contiguous() 31 | 32 | x = self.visual.trunk.norm_pre(x) 33 | out['clip_vis_dense'] = x.contiguous() 34 | return out 35 | 36 | def forward(self, x): 37 | self.eval() 38 | with torch.no_grad(): 39 | return self.extract_features(x) 40 | -------------------------------------------------------------------------------- /osprey/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPImageProcessor 5 | from .clip import CLIP 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, args, img_size=512, delay_load=False): 9 | super().__init__() 10 | 11 | # test 12 | if hasattr(args, 'mm_vision_tower'): 13 | self.clip_model = args.mm_vision_tower 14 | else: # train 15 | self.clip_model = args.vision_tower 16 | self.is_loaded = False 17 | self.img_size = img_size 18 | 19 | if not delay_load: 20 | self.load_model() 21 | 22 | def load_model(self): 23 | self.image_processor = CLIPImageProcessor(do_resize=True, size={"shortest_edge":self.img_size}, resample=3, do_center_crop=True, crop_size={"height": self.img_size, "width": self.img_size}, 24 | do_rescale=True, rescale_factor=0.00392156862745098, do_normalize=True, image_mean=[0.48145466, 0.4578275, 0.40821073], 25 | image_std=[0.26862954, 0.26130258, 0.27577711], do_convert_rgb=True, ) 26 | 27 | self.vision_tower = CLIP() 28 | 29 | self.vision_tower.load_state_dict(torch.load(self.clip_model),strict=False) 30 | 31 | self.is_loaded = True 32 | 33 | @torch.no_grad() 34 | def forward(self, images): 35 | 36 | if type(images) is list: 37 | image_features = [] 38 | image_features_dict = [] 39 | for image in images: 40 | image_feature_dict = self.vision_tower(image.unsqueeze(0)) 41 | image_features_dict.append(image_feature_dict) 42 | image_feature = image_feature_dict['res4'] 43 | image_feature = image_feature.reshape(*image_feature.shape[:2],-1).permute(0,2,1) 44 | image_features.append(image_feature) 45 | else: 46 | image_features_dict = self.vision_tower(images) 47 | image_features = image_features_dict['res4'] 48 | image_features = image_features.reshape(*image_features.shape[:2],-1).permute(0,2,1) 49 | 50 | return image_features, image_features_dict 51 | 52 | @property 53 | def dtype(self): 54 | return self.vision_tower.dtype 55 | 56 | @property 57 | def device(self): 58 | return self.vision_tower.device 59 | -------------------------------------------------------------------------------- /osprey/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return 
{"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | mm_hidden_size = getattr(config, 'mm_hidden_size', 768) 35 | projector_type = getattr(config, 'mm_projector_type', 'linear') 36 | 37 | if projector_type == 'linear': 38 | return nn.Linear(mm_hidden_size, config.hidden_size) 39 | 40 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 41 | if mlp_gelu_match: 42 | mlp_depth = int(mlp_gelu_match.group(1)) 43 | modules = [nn.Linear(mm_hidden_size, config.hidden_size)] 44 | for _ in range(1, mlp_depth): 45 | modules.append(nn.GELU()) 46 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 47 | return nn.Sequential(*modules) 48 | 49 | if projector_type == 'identity': 50 | return IdentityMap() 51 | 52 | raise ValueError(f'Unknown projector type: {projector_type}') 53 | -------------------------------------------------------------------------------- /osprey/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | import warnings 3 | 4 | import torch 5 | 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv 8 | 9 | try: 10 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func 11 | except ImportError: 12 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func 13 | from flash_attn.bert_padding import unpad_input, pad_input 14 | 15 | 16 | def forward( 17 | self, 18 | hidden_states: torch.Tensor, 19 | attention_mask: Optional[torch.Tensor] = None, 20 | position_ids: Optional[torch.Tensor] = None, 21 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 22 | output_attentions: bool = False, 23 | use_cache: bool = False, 24 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 25 | if output_attentions: 26 | warnings.warn( 27 | "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." 
28 | ) 29 | 30 | bsz, q_len, _ = hidden_states.size() 31 | # print("begin_#") 32 | query_states = ( 33 | self.q_proj(hidden_states) 34 | .view(bsz, q_len, self.num_heads, self.head_dim) 35 | .transpose(1, 2) 36 | ) 37 | # print("OK_#") 38 | key_states = ( 39 | self.k_proj(hidden_states) 40 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 41 | .transpose(1, 2) 42 | ) 43 | value_states = ( 44 | self.v_proj(hidden_states) 45 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 46 | .transpose(1, 2) 47 | ) # shape: (b, num_heads, s, head_dim) 48 | 49 | kv_seq_len = key_states.shape[-2] 50 | if past_key_value is not None: 51 | kv_seq_len += past_key_value[0].shape[-2] 52 | 53 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 54 | query_states, key_states = apply_rotary_pos_emb( 55 | query_states, key_states, cos, sin, position_ids 56 | ) 57 | 58 | if past_key_value is not None: 59 | # reuse k, v 60 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 61 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 62 | 63 | past_key_value = (key_states, value_states) if use_cache else None 64 | 65 | # repeat k/v heads if n_kv_heads < n_heads 66 | key_states = repeat_kv(key_states, self.num_key_value_groups) 67 | value_states = repeat_kv(value_states, self.num_key_value_groups) 68 | 69 | # Transform the data into the format required by flash attention 70 | qkv = torch.stack([query_states, key_states, value_states], dim=2) 71 | qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] 72 | key_padding_mask = attention_mask 73 | 74 | if key_padding_mask is None: 75 | qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim) 76 | cu_q_lens = torch.arange( 77 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 78 | ) 79 | max_s = q_len 80 | output = flash_attn_unpadded_qkvpacked_func( 81 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 82 | ) 83 | output = output.view(bsz, q_len, -1) 84 | else: 85 | qkv = qkv.reshape(bsz, q_len, -1) 86 | qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) 87 | qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) 88 | output_unpad = flash_attn_unpadded_qkvpacked_func( 89 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 90 | ) 91 | output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) 92 | output = pad_input(output_unpad, indices, bsz, q_len) 93 | 94 | return self.o_proj(output), None, past_key_value 95 | 96 | 97 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 98 | # requires the attention mask to be the same as the key_padding_mask 99 | def _prepare_decoder_attention_mask( 100 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 101 | ): 102 | # [bsz, seq_len] 103 | return attention_mask 104 | 105 | 106 | def replace_llama_attn_with_flash_attn(): 107 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 108 | if cuda_major < 8: 109 | warnings.warn( 110 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 
111 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 112 | ) 113 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 114 | _prepare_decoder_attention_mask 115 | ) 116 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 117 | -------------------------------------------------------------------------------- /osprey/train/osprey_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | 5 | from torch.utils.data import Sampler 6 | 7 | from transformers import Trainer 8 | from transformers.trainer import ( 9 | has_length, 10 | ) 11 | from typing import List, Optional 12 | 13 | 14 | def maybe_zero_3(param, ignore_status=False, name=None): 15 | from deepspeed import zero 16 | from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus 17 | if hasattr(param, "ds_id"): 18 | if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: 19 | if not ignore_status: 20 | print(name, 'no ignore status') 21 | with zero.GatheredParameters([param]): 22 | param = param.data.detach().cpu().clone() 23 | else: 24 | param = param.detach().cpu().clone() 25 | return param 26 | 27 | 28 | def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): 29 | to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} 30 | to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()} 31 | return to_return 32 | 33 | 34 | def split_to_even_chunks(indices, lengths, num_chunks): 35 | """ 36 | Split a list of indices into `chunks` chunks of roughly equal lengths. 37 | """ 38 | 39 | if len(indices) % num_chunks != 0: 40 | return [indices[i::num_chunks] for i in range(num_chunks)] 41 | 42 | num_indices_per_chunk = len(indices) // num_chunks 43 | 44 | chunks = [[] for _ in range(num_chunks)] 45 | chunks_lengths = [0 for _ in range(num_chunks)] 46 | for index in indices: 47 | shortest_chunk = chunks_lengths.index(min(chunks_lengths)) 48 | chunks[shortest_chunk].append(index) 49 | chunks_lengths[shortest_chunk] += lengths[index] 50 | if len(chunks[shortest_chunk]) == num_indices_per_chunk: 51 | chunks_lengths[shortest_chunk] = float("inf") 52 | 53 | return chunks 54 | 55 | 56 | def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None): 57 | # We need to use torch for the random part as a distributed sampler will set the random seed for torch. 58 | assert all(l != 0 for l in lengths), "Should not have zero length." 59 | mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0]) 60 | lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0]) 61 | 62 | assert len(mm_indices) > 0, "Should have at least one multimodal sample." 63 | assert len(lang_indices) > 0, "Should have at least one language sample." 
64 | 65 | mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)] 66 | lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)] 67 | megabatch_size = world_size * batch_size 68 | mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)] 69 | lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)] 70 | 71 | last_mm = mm_megabatches[-1] 72 | last_lang = lang_megabatches[-1] 73 | additional_batch = last_mm + last_lang 74 | megabatches = mm_megabatches[:-1] + lang_megabatches[:-1] 75 | megabatch_indices = torch.randperm(len(megabatches), generator=generator) 76 | megabatches = [megabatches[i] for i in megabatch_indices] 77 | 78 | if len(additional_batch) >= megabatch_size: 79 | megabatches = [additional_batch[:megabatch_size]] + megabatches 80 | additional_batch = additional_batch[megabatch_size:] 81 | 82 | if len(additional_batch) > 0: 83 | megabatches.append(additional_batch) 84 | 85 | return [i for megabatch in megabatches for i in megabatch] 86 | 87 | 88 | def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True): 89 | # We need to use torch for the random part as a distributed sampler will set the random seed for torch. 90 | indices = torch.randperm(len(lengths), generator=generator) 91 | megabatch_size = world_size * batch_size 92 | megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] 93 | megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches] 94 | megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches] 95 | 96 | return [i for megabatch in megabatches for batch in megabatch for i in batch] 97 | 98 | 99 | class LengthGroupedSampler(Sampler): 100 | r""" 101 | Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while 102 | keeping a bit of randomness. 
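    When `group_by_modality` is True, multimodal samples (encoded as positive lengths)
    and language-only samples (encoded as negative lengths) are additionally kept in
    separate megabatches via `get_modality_length_grouped_indices` above.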
103 | """ 104 | 105 | def __init__( 106 | self, 107 | batch_size: int, 108 | world_size: int, 109 | lengths: Optional[List[int]] = None, 110 | generator=None, 111 | group_by_modality: bool = False, 112 | ): 113 | if lengths is None: 114 | raise ValueError("Lengths must be provided.") 115 | 116 | self.batch_size = batch_size 117 | self.world_size = world_size 118 | self.lengths = lengths 119 | self.generator = generator 120 | self.group_by_modality = group_by_modality 121 | 122 | def __len__(self): 123 | return len(self.lengths) 124 | 125 | def __iter__(self): 126 | if self.group_by_modality: 127 | indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) 128 | else: 129 | indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) 130 | return iter(indices) 131 | 132 | 133 | class OspreyTrainer(Trainer): 134 | 135 | def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: 136 | if self.train_dataset is None or not has_length(self.train_dataset): 137 | return None 138 | 139 | if self.args.group_by_modality_length: 140 | lengths = self.train_dataset.modality_lengths 141 | return LengthGroupedSampler( 142 | # self.args.train_batch_size * self.args.gradient_accumulation_steps, # TODO: seems that we should not have gradient_accumulation_steps 143 | self.args.train_batch_size, 144 | world_size=self.args.world_size, 145 | lengths=lengths, 146 | group_by_modality=True, 147 | ) 148 | else: 149 | return super()._get_train_sampler() 150 | 151 | def _save_checkpoint(self, model, trial, metrics=None): 152 | if getattr(self.args, 'tune_mm_mlp_adapter', False): 153 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 154 | checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" 155 | 156 | run_dir = self._get_output_dir(trial=trial) 157 | output_dir = os.path.join(run_dir, checkpoint_folder) 158 | 159 | # Only save Adapter 160 | keys_to_match = ['mm_projector', 'vision_resampler'] 161 | if getattr(self.args, "use_im_start_end", False): 162 | keys_to_match.extend(['embed_tokens', 'embed_in']) 163 | 164 | weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match) 165 | 166 | if self.args.local_rank == 0 or self.args.local_rank == -1: 167 | self.model.config.save_pretrained(output_dir) 168 | torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) 169 | else: 170 | super(OspreyTrainer, self)._save_checkpoint(model, trial, metrics) 171 | 172 | def _save(self, output_dir: Optional[str] = None, state_dict=None): 173 | if getattr(self.args, 'tune_mm_mlp_adapter', False): 174 | pass 175 | else: 176 | super(OspreyTrainer, self)._save(output_dir, state_dict) 177 | -------------------------------------------------------------------------------- /osprey/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 
6 | from osprey.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | replace_llama_attn_with_flash_attn() 9 | 10 | from osprey.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /osprey/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from osprey.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True) 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 
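    Replaces `reset_parameters` of nn.Linear and nn.LayerNorm with no-ops; the weights
    are expected to be overwritten when a pretrained checkpoint is loaded.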
96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "osprey" 7 | version = "1.0" 8 | description = "Pixel Understanding with Visual Instruction Tuning." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.0.1", "torchvision==0.15.2", 17 | "transformers==4.31.0", 18 | "einops", "fastapi", "markdown2[all]", "numpy", 19 | "requests", "sentencepiece", "tokenizers>=0.12.1", 20 | "uvicorn", "tensorboard", "open_clip_torch", 21 | "shortuuid", "httpx==0.24.0", 22 | "deepspeed==0.9.5", 23 | "peft==0.4.0", 24 | "transformers==4.31.0", 25 | "accelerate==0.21.0", 26 | "bitsandbytes==0.41.0", 27 | "scikit-learn==1.2.2", 28 | "sentencepiece==0.1.99", 29 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 30 | "gradio_client==0.2.9", 31 | "pycocotools", "terminaltables", "lvis" 32 | ] 33 | 34 | [tool.setuptools.packages.find] 35 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 36 | 37 | [tool.wheel] 38 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 39 | -------------------------------------------------------------------------------- /scripts/stage2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=`pwd`:$PYTHONPATH 3 | export TRAIN_MASK_MODULE=1 4 | 5 | deepspeed --include localhost:0,1,2,3 osprey/train/train_mem.py \ 6 | --deepspeed ./scripts/zero2.json \ 7 | --model_name_or_path vicuna-7b-v1.5 \ 8 | --dataset_config ./osprey/configs/stage2.json \ 9 | --version v1 \ 10 | --vision_tower laion2b_s29b_b131k_ft_soup.bin \ 11 | --pretrain_mm_mlp_adapter osprey-v1.0-7b-pretrain/mm_projector.bin \ 12 | --mm_projector_type mlp2x_gelu \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --image_aspect_ratio pad \ 17 | --group_by_modality_length True \ 18 | --bf16 True \ 19 | --output_dir './exp/stage2' \ 20 | --num_train_epochs 2 \ 21 | --per_device_train_batch_size 1\ 22 | --per_device_eval_batch_size 4 \ 23 | 
--gradient_accumulation_steps 1 \ 24 | --evaluation_strategy "no" \ 25 | --save_strategy "steps" \ 26 | --save_steps 10000 \ 27 | --save_total_limit 1 \ 28 | --learning_rate 2e-5 \ 29 | --weight_decay 0. \ 30 | --warmup_ratio 0.03 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --tf32 True \ 34 | --model_max_length 2048 \ 35 | --gradient_checkpointing True \ 36 | --dataloader_num_workers 4 \ 37 | --lazy_preprocess True \ 38 | --report_to "none" \ 39 | --group_by_modality_length False 40 | -------------------------------------------------------------------------------- /scripts/stage3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PYTHONPATH=`pwd`:$PYTHONPATH 3 | 4 | deepspeed --include localhost:0,1,2,3 osprey/train/train_mem.py \ 5 | --deepspeed ./scripts/zero2.json \ 6 | --model_name_or_path /exp/stage2/checkpoint-final \ 7 | --dataset_config ./osprey/configs/stage3.json \ 8 | --version v1 \ 9 | --vision_tower laion2b_s29b_b131k_ft_soup.bin \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --mm_vision_select_layer -2 \ 12 | --mm_use_im_start_end False \ 13 | --mm_use_im_patch_token False \ 14 | --image_aspect_ratio pad \ 15 | --group_by_modality_length True \ 16 | --bf16 True \ 17 | --output_dir './exp/stage3' \ 18 | --num_train_epochs 2 \ 19 | --per_device_train_batch_size 1\ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 2000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 1e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to "none" \ 37 | --group_by_modality_length False 38 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e8, 25 | "stage3_max_reuse_distance": 1e8, 26 | 
"stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e8, 47 | "stage3_max_reuse_distance": 1e8, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } --------------------------------------------------------------------------------