├── .gitattributes ├── .gitignore ├── README.md ├── arguments.py ├── config ├── deepspeed_config.json └── deepspeed_config_zero3.json ├── data_preprocess.py ├── environment.yml ├── example.ipynb ├── images ├── cal_num1.png ├── cal_num2.png ├── cal_num3.png ├── chinchilla.png ├── dataset.png ├── example.png ├── example2.png ├── example3.png ├── flamingo.png ├── flamingo_3d.png ├── flamingo_cartoon.png ├── flamingo_photo.png ├── model_struct.png ├── new_example.png ├── overview.png └── shiba.png ├── model ├── .DS_Store ├── blip2 │ ├── .DS_Store │ ├── __init__.py │ ├── configuration_blip_2.py │ ├── convert_blip_2_original_to_pytorch.py │ ├── modeling_blip_2.py │ └── processing_blip_2.py ├── instructblip │ ├── __init__.py │ ├── configuration_instructblip.py │ ├── convert_instructblip_original_to_pytorch.py │ ├── modeling_instructblip.py │ └── processing_instructblip.py └── utils.py ├── paper.pdf ├── run.py ├── run_script ├── .DS_Store └── flickr │ ├── deep_speed_blip2_t5xl.sh │ ├── deep_speed_blip2_t5xxl.sh │ ├── deep_speed_instructblip_t5xl.sh │ ├── deep_speed_instructblip_t5xxl.sh │ ├── deep_speed_instructblip_vicuna13.sh │ └── deep_speed_instructblip_vicuna7b.sh ├── search.py ├── tasks ├── .DS_Store ├── utils.py └── vqa │ ├── dataset.py │ └── get_trainer.py └── training ├── trainer_base.py ├── trainer_blip2.py └── trainer_instructblip2.py /.gitattributes: -------------------------------------------------------------------------------- 1 | pycocoevalcap/ filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore : -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | **/__pycache__ 6 | __pycache__ 7 | .ipynb_checkpoints 8 | */.ipynb_checkpoints/* 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | prompt_data1/ 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv/ 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | .dmypy.json 116 | dmypy.json 117 | 118 | # Pyre type checker 119 | .pyre/ 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MIC 2 | Visual Language Models (VLMs) have made significant progress on various downstream tasks through the development of large-scale multimodal models, but they sometimes lack reasoning and in-context learning abilities. Large Language Models (LLMs), on the other hand, have revolutionized the NLP community with their strong reasoning and in-context learning capabilities: they can quickly adapt to new inference tasks, such as question answering and commonsense reasoning, without fine-tuning pre-trained models or updating parameters. 3 | 4 | Studying in-context learning abilities helps VLMs generalize to new knowledge in lifelong learning settings, develop new capabilities, and advance toward more general artificial intelligence. Therefore, we propose the **MIC (Multimodality In-Context Learning)** dataset, a manually constructed instruction tuning dataset that supports interleaved text-image inputs, inter-related multiple-image inputs, and multimodal in-context learning inputs. By fine-tuning VLMs on MIC, we enable them to acquire multimodal in-context learning capabilities and to understand complex relationships between instructions and multiple images. 5 | 6 | # Overview 7 | ![Overview](images/overview.png ) 8 | 9 | # News 🚀 10 | 1. [09-19] We have converted the MMICL demo to a permanent link: [Demo for MMICL](http://www.testmmicl.work). The Vicuna version of MMICL and Chat Mode are presently under development, so they may require careful adjustment of generation parameters and may not work correctly. 11 | 2. [09-15] Our [paper](https://arxiv.org/abs/2309.07915) has been uploaded to arXiv. 12 | 3. [09-01] The [MIC](https://huggingface.co/datasets/BleachNick/MIC_full) data (**both jsonl files and most image data**) has been released on the Hugging Face Hub. **Because of a network issue, the [MIC](https://www.modelscope.cn/datasets/BleachNick/MIC/) data, which includes jsonl files and corresponding image files for most datasets, has also been released on ModelScope.** 13 | 4.
[08-23] Reached 1st place on [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) and 1st place on [MMBench](https://opencompass.org.cn/leaderboard-multimodal) 14 | 5. [08-21] The [MMICL-FLANT5XXL](https://huggingface.co/BleachNick/MMICL-Instructblip-T5-xxl) and [MMICL-Tiny](https://huggingface.co/BleachNick/MMICL-Instructblip-T5-xl) models have been released on the Hugging Face Hub. 15 | 16 | ## MMICL 17 | **MMICL (Multi-Modal In-Context Learning)** is a multimodal vision-language model that incorporates BLIP-2/InstructBLIP. It can analyze and understand multiple images as well as follow instructions. MMICL outperforms VL models of the same size and performs exceptionally well on complex visual reasoning datasets. It achieves **state-of-the-art** performance on both multimodal task leaderboards and a 18 | wide range of vision-language tasks. Furthermore, it showcases new capabilities in video understanding and multimodal in-context learning (M-ICL). 19 | 20 | 🔥 Further details of the model and dataset will be released ASAP. 21 | 22 | 23 | **Model**: [MMICL-FLANT5XXL](https://huggingface.co/BleachNick/MMICL-Instructblip-T5-xxl); [MMICL-Tiny](https://huggingface.co/BleachNick/MMICL-Instructblip-T5-xl) 24 | 25 | **Data**: [MIC_5M](https://huggingface.co/datasets/BleachNick/MIC_full); [MIC_Sampled](https://huggingface.co/datasets/BleachNick/MIC_sampled) 26 | 27 | **Demo for MMICL**: [playground for MMICL-FLANT5XXL](http://testmmicl.work) 28 | 29 | + **Capability of multi-image referring and reasoning** 30 | 31 | + **Manually constructed in-context instruction tuning dataset** 32 | 33 | + As of 21 Aug. 2023, **1st on [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation), 1st on [MMBench](https://opencompass.org.cn/leaderboard-multimodal)** 34 | 35 | + Visual Encoder: ViT-L from CLIP / ViT-G/14 from EVA-CLIP 36 | 37 | + Pre-trained LLM: FlanT5-XL / FlanT5-XXL / Vicuna-7B / Vicuna-13B 38 | 39 | 40 | 41 | 42 | ## Performance 43 | 44 | ### [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) 45 | 46 | | |**Perception(Overall)**| Existence| Count | Position | Color | Posters | Celebrity | Scene | Landmark | Artwork | OCR | 47 | |-------|------------|-----------|--------|----------|-------|---------|-----------|-------|----------|---------|-------| 48 | | SCORE | **1376.00**| 175.00 | 143.33 | 73.33 | **171.67** | 130.95 | 145.88 | 152.75 | 135.08 | 133.00 | 115.00 | 49 | | RANK | **1** | 4 | 2 | 5 | **1** | 7 | 3 | 6 | 8 | 3 | 2 | 50 | 51 | 52 | | | **Cognition(Overall)** | Commonsense Reasoning | Numerical Calculation | Text Translation | Code Reasoning | 53 | |-------|-----------|-----------------------|-----------------------|------------------|----------------| 54 | | SCORE | **360.36** | 117.86 | 62.50 | **107.50** | 72.50 | 55 | | RANK | **1** | 2 | 5 | **1** | 4 | 56 | 57 | 58 | 59 | 60 | 61 | ### [MMBench](https://opencompass.org.cn/leaderboard-multimodal) 62 | + test set 63 | 64 | | Method | Vision Model | Language Model | Overall | LR | AR | RR | FP-S | FP-C | CP | 65 | |---------|--------------|----------------|----------|-------|--------|-------|------|-------|--------| 66 | | MMICL | EVA-G | FLANT5-XXL | **65.2** | 44.3 | **77.9** | **64.8** | **66.5** | **53.6** | **70.6** | 67 | | [JiuTian](https://github.com/rshaojimmy/JiuTian) | EVA-G | FLANT5-XXL | 64.7 | 46.6 | 76.5 | 66.7 | 66.5 | 51.6 | 68.7 | 68 | 69 | + dev set 70 | 71 | | Method | Vision Model | Language Model | Overall | LR | AR | RR | FP-S | FP-C | CP |
72 | |---------|--------------|----------------|----------|-------|--------|-------|------|-------|--------| 73 | | MMICL | EVA-G | FLANT5-XXL | **67.9** | **49.2** | **77.9** | **73.0** | 66.7 | 57.2 | **77.2** | 74 | | [JiuTian](https://github.com/rshaojimmy/JiuTian) | EVA-G | FLANT5-XXL | 67.1 | 46.7 | 69.7 | 71.3 | 67.3 | 58.6 | 75.8 | 75 | 76 | 77 | ## Reproduce Tips 78 | Since the experiments reported in our paper were all conducted on NVIDIA DGX-A40 servers (which might be difficult to acquire), 79 | we reimplement MMICL with: 80 | 81 | * Ubuntu servers with 4 or 6 NVIDIA A40 (46GB) GPUs 82 | * Apex installed to enable bf16 training and evaluation 83 | * CUDA 11.3 and DeepSpeed to enable ZeRO-2 offload and ZeRO-3 training 84 | * Packages pinned to the versions provided below 85 | 86 | ### Setup 87 | We conduct our experiments with Anaconda3. If you have Anaconda3 installed, create the environment for MMICL: 88 | 89 | ```shell 90 | conda env create -f environment.yml 91 | ``` 92 | 93 | 94 | ### MIC Data 95 | MIC is built from multiple data sources such as VCR, VQAv2, GQA, COCO, NLVR2, OKVQA, Flickr, STVQA, MSRVTT, MSRVTT-QA, TextVQA, RefCOCO, WikiArt, DiffusionDB, VSR, LLaVA-Instruct, and MiniImagenet. 96 | We transform these sources into a few-shot style and store them as jsonl files: 97 | all data is converted into a multi-instruction style with zero- to few-shot examples. 98 | Run the preprocessing script to convert the data into raw Arrow files for further training: 99 | ```shell 100 | # The details of the MIC dataset will be added ASAP 101 | python data_preprocess.py 102 | ``` 103 | ![Dataset](images/dataset.png ) 104 | #### Format 105 | ``` 106 | 1. Interleaved Image-Text Data 107 | 108 | Input: Image 0 is {image 0} 109 | ... 110 | Image j is {image j} 111 | {question} 112 | 113 | MMICL: {answer} 114 | 115 | 2. In-Context Demonstration Data 116 | 117 | Input: Image 0 is {image 0}. 118 | {question} 119 | {answer} 120 | ... 121 | Image j is {image j}. 122 | {question} 123 | 124 | MMICL: {answer} 125 | ```
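To make the templates above concrete, here is a minimal sketch of how a few (image, question, answer) examples could be packed into one in-context demonstration record and written to a jsonl file. This is only an illustration of the format described above, not the actual `data_preprocess.py` logic; the record keys (`input_text`, `output_text`, `input_image`) and the per-example field names are assumptions.

```python
import json

def build_icl_record(examples, num_shots=2):
    """Pack `num_shots` demonstrations plus one final query into a single
    in-context record following the template above. Each element of
    `examples` is a dict with hypothetical keys: image_path, question, answer."""
    demos, query = examples[:num_shots], examples[num_shots]
    parts, image_paths = [], []

    # In-context demonstrations: image declaration, question and answer.
    for j, ex in enumerate(demos):
        parts.append(f"Image {j} is {{image {j}}}.\n{ex['question']}\n{ex['answer']}")
        image_paths.append(ex["image_path"])

    # Final query: its answer becomes the target that MMICL should generate.
    q = len(demos)
    parts.append(f"Image {q} is {{image {q}}}.\n{query['question']}")
    image_paths.append(query["image_path"])

    return {
        "input_text": "Input: " + "\n".join(parts),  # hypothetical key names
        "output_text": query["answer"],
        "input_image": image_paths,
    }

if __name__ == "__main__":
    examples = [
        {"image_path": "images/cal_num1.png", "question": "What is the equation?", "answer": "2+1=3"},
        {"image_path": "images/cal_num2.png", "question": "What is the equation?", "answer": "5+6=11"},
        {"image_path": "images/cal_num3.png", "question": "What is the equation?", "answer": "3x6=18"},
    ]
    with open("mic_icl_sample.jsonl", "w") as f:
        f.write(json.dumps(build_icl_record(examples), ensure_ascii=False) + "\n")
```

At training and inference time the `{image j}` placeholders are replaced by the model's image proxy tokens (compare how `replace_token` is built in the Inference section below), so the prompt text stays aligned with the list of images by index.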
126 | - interleaved image-text example 127 | ![interleaved image text example](images/example.png ) 128 | 129 | - relevant image example 130 | ![relevant image example](images/example2.png ) 131 | 132 | - in-context example 133 | ![in context example](images/example3.png ) 134 | 135 | ### Examples 136 | ![Examples](images/new_example.png ) 137 | 138 | ### Model Structure 139 | ![Model Structure](images/model_struct.png ) 140 | 141 | ### Training 142 | Run the training scripts in [run_script](run_script): 143 | 144 | ```shell 145 | # For training MMICL in the FLANT5-xxl version 146 | bash run_script/flickr/deep_speed_instructblip_t5xxl.sh 147 | ``` 148 | ### Inference 149 | 150 | 151 | ``` 152 | # For T5 based model 153 | from model.instructblip import InstructBlipConfig, InstructBlipModel, InstructBlipPreTrainedModel,InstructBlipForConditionalGeneration,InstructBlipProcessor 154 | import datasets 155 | import json 156 | import transformers 157 | from PIL import Image 158 | import torch 159 | model_type="instructblip" 160 | model_ckpt="BleachNick/MMICL-Instructblip-T5-xxl" 161 | processor_ckpt = "Salesforce/instructblip-flan-t5-xxl" 162 | config = InstructBlipConfig.from_pretrained(model_ckpt ) 163 | 164 | if 'instructblip' in model_type: 165 | model = InstructBlipForConditionalGeneration.from_pretrained( 166 | model_ckpt, 167 | config=config).to('cuda:0',dtype=torch.bfloat16) 168 | 169 | image_palceholder="图" 170 | sp = [image_palceholder]+[f"<image{i}>" for i in range(20)] 171 | processor = InstructBlipProcessor.from_pretrained( 172 | processor_ckpt 173 | ) 174 | sp = sp+processor.tokenizer.additional_special_tokens[len(sp):] 175 | processor.tokenizer.add_special_tokens({'additional_special_tokens':sp}) 176 | if model.qformer.embeddings.word_embeddings.weight.shape[0] != len(processor.qformer_tokenizer): 177 | model.qformer.resize_token_embeddings(len(processor.qformer_tokenizer)) 178 | replace_token="".join(32*[image_palceholder]) 179 | ``` 180 | #### Images: 181 |

182 | Image 1 183 | Image 2 184 | Image 3 185 |

186 | 187 | ``` 188 | image = Image.open ("images/cal_num1.png") 189 | image1 = Image.open ("images/cal_num2.png") 190 | image2 = Image.open ("images/cal_num3.png") 191 | images = [image,image1,image2] 192 | 193 | prompt = [f'Use the image 0: {replace_token},image 1: {replace_token} and image 2: {replace_token} as a visual aid to help you calculate the equation accurately. image 0 is 2+1=3.\nimage 1 is 5+6=11.\nimage 2 is"'] 194 | prompt = " ".join(prompt) 195 | 196 | inputs = processor(images=images, text=prompt, return_tensors="pt") 197 | 198 | inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16) 199 | inputs['img_mask'] = torch.tensor([[1 for i in range(len(images))]]) 200 | inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0) 201 | 202 | inputs = inputs.to('cuda:0') 203 | outputs = model.generate( 204 | pixel_values = inputs['pixel_values'], 205 | input_ids = inputs['input_ids'], 206 | attention_mask = inputs['attention_mask'], 207 | img_mask = inputs['img_mask'], 208 | do_sample=False, 209 | max_length=50, 210 | min_length=1, 211 | set_min_padding_size =False, 212 | ) 213 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 214 | print(generated_text) 215 | 216 | ``` 217 | #### Output: 218 | 219 | 3x6=18" 220 | 221 | #### Images: 222 |

223 | Image 1 224 | Image 2 225 | Image 3 226 |

227 | 228 | ``` 229 | image = Image.open ("images/chinchilla.png") 230 | image1 = Image.open ("images/shiba.png") 231 | image2 = Image.open ("images/flamingo.png") 232 | images = [image,image1,image2] 233 | images = [image,image1,image2] 234 | prompt = [f'image 0 is {replace_token},image 1 is {replace_token},image 2 is {replace_token}. Question: is a chinchilla. They are mainly found in Chile.\n Question: is a shiba. They are very popular in Japan.\nQuestion: image 2 is'] 235 | 236 | prompt = " ".join(prompt) 237 | 238 | inputs = processor(images=images, text=prompt, return_tensors="pt") 239 | 240 | inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16) 241 | inputs['img_mask'] = torch.tensor([[1 for i in range(len(images))]]) 242 | inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0) 243 | 244 | inputs = inputs.to('cuda:0') 245 | outputs = model.generate( 246 | pixel_values = inputs['pixel_values'], 247 | input_ids = inputs['input_ids'], 248 | attention_mask = inputs['attention_mask'], 249 | img_mask = inputs['img_mask'], 250 | do_sample=False, 251 | max_length=50, 252 | min_length=1, 253 | set_min_padding_size =False, 254 | ) 255 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 256 | print(generated_text) 257 | 258 | 259 | ``` 260 | #### Output: 261 | 262 | a flamingo. They are native to South America and are known for their bright red plumage and distinctive call. 263 | 264 | 265 | #### Images: 266 | 267 |

268 | Image 1 269 | Image 2 270 | Image 3 271 |

272 | 273 | ``` 274 | 275 | image = Image.open ("images/flamingo_photo.png") 276 | image1 = Image.open ("images/flamingo_cartoon.png") 277 | image2 = Image.open ("images/flamingo_3d.png") 278 | 279 | images = [image,image1,image2] 280 | prompt = [f'Use the image 0: {replace_token}, image 1: {replace_token} and image 2: {replace_token} as a visual aids to help you answer the question. Question: Give the reason why image 0, image 1 and image 2 are different? Answer:'] 281 | 282 | prompt = " ".join(prompt) 283 | 284 | inputs = processor(images=images, text=prompt, return_tensors="pt") 285 | 286 | inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16) 287 | inputs['img_mask'] = torch.tensor([[1 for i in range(len(images))]]) 288 | inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0) 289 | 290 | inputs = inputs.to('cuda:0') 291 | outputs = model.generate( 292 | pixel_values = inputs['pixel_values'], 293 | input_ids = inputs['input_ids'], 294 | attention_mask = inputs['attention_mask'], 295 | img_mask = inputs['img_mask'], 296 | do_sample=False, 297 | max_length=80, 298 | min_length=50, 299 | num_beams=8, 300 | set_min_padding_size =False, 301 | ) 302 | generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() 303 | print(generated_text) 304 | 305 | 306 | ``` 307 | Output: 308 | 309 | image 0 is a photo of a flamingo standing in the water, image 1 is a cartoon drawing of a flamingo and image 2 is a low polygon count 3d model animation 310 | 311 | 312 | 313 | 314 | 315 | ## Reference 316 |
**📑 If you find our projects helpful to your research, please consider citing:**
317 | ``` 318 | @article{zhao2023mmicl, 319 | title={MMICL: Empowering Vision-language Model with Multi-Modal In-Context Learning}, 320 | author={Zhao, Haozhe and Cai, Zefan and Si, Shuzheng and Ma, Xiaojian and An, Kaikai and Chen, Liang and Liu, Zixuan and Wang, Sheng and Han, Wenjuan and Chang, Baobao}, 321 | journal={arXiv preprint arXiv:2309.07915}, 322 | year={2023} 323 | } 324 | ``` 325 | 326 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import argparse 3 | import dataclasses 4 | from dataclasses import dataclass, field 5 | from typing import Optional 6 | 7 | from transformers import HfArgumentParser, TrainingArguments 8 | 9 | from tasks.utils import * 10 | 11 | 12 | @dataclass 13 | class DataTrainingArguments: 14 | """ 15 | Arguments pertaining to what data we are going to input our model for training and eval. 16 | 17 | Using `HfArgumentParser` we can turn this class 18 | into argparse arguments to be able to specify them on 19 | the command line.training_args 20 | """ 21 | 22 | dataset_name: str = field( 23 | metadata={ 24 | "help": "The name of the dataset to use: " + ", ".join(DATASETS), 25 | "choices": DATASETS 26 | } 27 | ) 28 | dataset_config_name: Optional[str] = field( 29 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 30 | ) 31 | max_seq_length: int = field( 32 | default=128, 33 | metadata={ 34 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 35 | "than this will be truncated, sequences shorter will be padded." 36 | }, 37 | ) 38 | overwrite_cache: bool = field( 39 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} 40 | ) 41 | pad_to_max_length: bool = field( 42 | default=True, 43 | metadata={ 44 | "help": "Whether to pad all samples to `max_seq_length`. " 45 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 46 | }, 47 | ) 48 | max_train_samples: Optional[int] = field( 49 | default=None, 50 | metadata={ 51 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 52 | "value if set." 53 | }, 54 | ) 55 | max_eval_samples: Optional[int] = field( 56 | default=None, 57 | metadata={ 58 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 59 | "value if set." 60 | }, 61 | ) 62 | max_predict_samples: Optional[int] = field( 63 | default=None, 64 | metadata={ 65 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 66 | "value if set." 
67 | }, 68 | ) 69 | train_file: Optional[str] = field( 70 | default='dialog_version_control/data/ATIS/train.json', metadata={"help": "A csv or a json file containing the training data."} 71 | ) 72 | validation_file: Optional[str] = field( 73 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 74 | ) 75 | test_file: Optional[str] = field( 76 | default='dialog_version_control/data/ATIS/test.json', 77 | metadata={"help": "A csv or a json file containing the test data."} 78 | ) 79 | label_file: Optional[str] = field( 80 | default='dialog_version_control/data/ATIS/label.txt', 81 | metadata={"help": "A txt file containing the label data."} 82 | ) 83 | dev_rate: Optional[float] = field( 84 | default=0.1, 85 | metadata={ 86 | "help": "For spliting a dev set" 87 | }, 88 | ) 89 | use_preprocessed: Optional[bool] = field( 90 | default=False, 91 | metadata={ 92 | "help": "whether to use preprocessed data" 93 | }, 94 | ) 95 | done_preprocess: Optional[bool] = field( 96 | default=False, 97 | metadata={ 98 | "help": "whether has finished the data preprocess " 99 | }, 100 | ) 101 | load_datatype: Optional[str] = field( 102 | default=None, 103 | metadata={ 104 | "help": "json or parquet" 105 | }, 106 | ) 107 | only_evaluate: Optional[bool] = field( 108 | default=False, 109 | metadata={ 110 | "help": "whether to only test the result" 111 | }, 112 | ) 113 | load_from_base64: Optional[bool] = field( 114 | default=False, 115 | metadata={ 116 | "help": "whether to load preprocessed image data from base64" 117 | }, 118 | ) 119 | training_preprocess: Optional[bool] = field( 120 | default=False, 121 | metadata={ 122 | "help": "whether to preprocess data during training" 123 | }, 124 | ) 125 | label_max_length: Optional[int] = field( 126 | default=64, 127 | metadata={ 128 | "help": "label_max_length" 129 | }, 130 | ) 131 | data_dir: Optional[str] = field( 132 | default=None, 133 | metadata={ 134 | "help": "data_dir" 135 | }, 136 | ) 137 | 138 | @dataclass 139 | class ModelArguments: 140 | """ 141 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 142 | """ 143 | model_name_or_path: str = field( 144 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 145 | ) 146 | config_name: Optional[str] = field( 147 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 148 | ) 149 | tokenizer_name: Optional[str] = field( 150 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 151 | ) 152 | cache_dir: Optional[str] = field( 153 | default=None, 154 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 155 | ) 156 | use_fast_tokenizer: bool = field( 157 | default=True, 158 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 159 | ) 160 | model_revision: str = field( 161 | default="main", 162 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 163 | ) 164 | # NOTE 没用到 165 | task_type: Optional[str] = field( 166 | default="language_modeling", 167 | metadata={ 168 | "help": "Design which head to use." 169 | } 170 | ) 171 | eval_type: Optional[str] = field( 172 | default="eval", 173 | metadata={ 174 | "help": "Design which head to use." 
175 | } 176 | ) 177 | prompt_type: Optional[str] = field( 178 | default="soft", 179 | metadata={ 180 | "help": "Use hard or soft prompt" 181 | } 182 | ) 183 | template_id: Optional[str] = field( 184 | default="template_0", 185 | metadata={ 186 | "help": "The specific soft prompt template to use" 187 | } 188 | ) 189 | verbalizer_id: Optional[str] = field( 190 | default="verbalizer_0", 191 | metadata={ 192 | "help": "The specific verbalizer to use" 193 | } 194 | ) 195 | prompt_operation: Optional[str] = field( 196 | default="mean", 197 | metadata={ 198 | "help": "Will use max, sum, mean, attention or cross-attention soft prompt tuning during training" 199 | } 200 | ) 201 | hidden_dropout_prob: float = field( 202 | default=0.1, 203 | metadata={ 204 | "help": "The dropout probability used in the models" 205 | } 206 | ) 207 | num_attention_layers: int = field( 208 | default=1, 209 | metadata={ 210 | "help": "" 211 | } 212 | ) 213 | num_attention_heads: int = field( 214 | default=8, 215 | metadata={ 216 | "help": "" 217 | } 218 | ) 219 | whether_PositionalEncoding: bool = field( 220 | default=True, 221 | metadata={ 222 | "help": "" 223 | } 224 | ) 225 | whether_PositionalWiseFeedForward: bool = field( 226 | default=True, 227 | metadata={ 228 | "help": "" 229 | } 230 | ) 231 | fix_deberta: bool = field( 232 | default=True, 233 | metadata={ 234 | "help": "" 235 | } 236 | ) 237 | data_augmentation: Optional[str] = field( 238 | default="none", 239 | metadata={ 240 | "help": "rdrop, AT, mixup, manifold_mixup" 241 | } 242 | ) 243 | model_type: Optional[str] = field( 244 | default="blip2", 245 | metadata={ 246 | "help": "blip2, instructblip" 247 | } 248 | ) 249 | label: Optional[str] = field( 250 | default="label", 251 | metadata={ 252 | "help": "" 253 | } 254 | ) 255 | experiment_name: Optional[str] = field( 256 | default="label", 257 | metadata={ 258 | "help": "" 259 | } 260 | ) 261 | # Negative Sample 262 | negative_sample_num: Optional[int] = field( 263 | default=1, 264 | metadata={ 265 | "help": "" 266 | } 267 | 268 | ) 269 | processor_path: Optional[str] = field( 270 | default=None, 271 | metadata={ 272 | "help": "" 273 | } 274 | ) 275 | backbone_model: Optional[str] = field( 276 | default="flan-t5", 277 | metadata={ 278 | "help": "flan-t5,opt,vicuna" 279 | } 280 | ) 281 | image_place_holder: Optional[str] = field( 282 | default=None, 283 | metadata={ 284 | "help": "place holder for special token" 285 | } 286 | ) 287 | 288 | 289 | @dataclass 290 | class ExtraTrainingArguments(TrainingArguments): 291 | generation_max_length: Optional[int] = field( 292 | default=32, 293 | metadata={ 294 | "help": "generation_max_length" 295 | } 296 | ) 297 | generation_min_length: Optional[int] = field( 298 | default=1, 299 | metadata={ 300 | "help": "generation_min_length" 301 | } 302 | ) 303 | generation_num_beams: Optional[int] = field( 304 | default=1, 305 | metadata={ 306 | "help": "generation_num_beams" 307 | } 308 | ) 309 | predict_with_generate: bool = field( 310 | default=True, 311 | metadata={ 312 | "help": "" 313 | } 314 | ) 315 | few_shot : bool = field( 316 | default=False, 317 | metadata={ 318 | "help": "" 319 | } 320 | ) 321 | using_instruct_qformer: bool = field( 322 | default=True, 323 | metadata={ 324 | "help": "" 325 | } 326 | ) 327 | full_bf16_training: bool = field( 328 | default=False, 329 | metadata={ 330 | "help": "WHETHER TO USE BF16 full TRAINING" 331 | } 332 | ) 333 | def get_args(): 334 | """Parse all the args.""" 335 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, 
ExtraTrainingArguments)) 336 | 337 | args = parser.parse_args_into_dataclasses() 338 | 339 | return args -------------------------------------------------------------------------------- /config/deepspeed_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "optimizer": { 12 | "type": "AdamW", 13 | "params": { 14 | "lr": "auto", 15 | "betas": "auto", 16 | "eps": "auto", 17 | "weight_decay": "auto" 18 | } 19 | }, 20 | 21 | "scheduler": { 22 | "type": "WarmupLR", 23 | "params": { 24 | "warmup_min_lr": "auto", 25 | "warmup_max_lr": "auto", 26 | "warmup_num_steps": "auto" 27 | } 28 | }, 29 | 30 | "zero_optimization": { 31 | "stage": 2, 32 | "offload_param": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_optimizer": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "allgather_partitions": true, 41 | "allgather_bucket_size": 6e7, 42 | "overlap_comm": true, 43 | "reduce_scatter": true, 44 | "reduce_bucket_size": 6e7, 45 | "contiguous_gradients": true 46 | }, 47 | 48 | "gradient_accumulation_steps": "auto", 49 | "gradient_clipping": "auto", 50 | "train_batch_size": "auto", 51 | "train_micro_batch_size_per_gpu": "auto" 52 | } 53 | -------------------------------------------------------------------------------- /config/deepspeed_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "optimizer": { 12 | "type": "AdamW", 13 | "params": { 14 | "lr": "auto", 15 | "betas": "auto", 16 | "eps": "auto", 17 | "weight_decay": "auto" 18 | } 19 | }, 20 | 21 | "scheduler": { 22 | "type": "WarmupLR", 23 | "params": { 24 | "warmup_min_lr": "auto", 25 | "warmup_max_lr": "auto", 26 | "warmup_num_steps": "auto" 27 | } 28 | }, 29 | 30 | "zero_optimization": { 31 | "stage": 3, 32 | "contiguous_gradients": true, 33 | "stage3_max_live_parameters": 1e9, 34 | "stage3_max_reuse_distance": 1e9, 35 | "stage3_prefetch_bucket_size": 1e7, 36 | "stage3_param_persistence_threshold": 1e5, 37 | "reduce_bucket_size": 1e7, 38 | "sub_group_size": 1e9, 39 | 40 | "offload_param": { 41 | "device": "cpu", 42 | "pin_memory": true 43 | }, 44 | "offload_optimizer": { 45 | "device": "cpu", 46 | "pin_memory": true 47 | }, 48 | "overlap_comm": true, 49 | "stage3_gather_16bit_weights_on_model_save": true 50 | }, 51 | 52 | "gradient_accumulation_steps": "auto", 53 | "gradient_clipping": "auto", 54 | "train_batch_size": "auto", 55 | "train_micro_batch_size_per_gpu": "auto" 56 | } 57 | 58 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: mmicl 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - _openmp_mutex=5.1=1_gnu 8 | - blas=1.0=mkl 9 | - bzip2=1.0.8=h7b6447c_0 10 | - ca-certificates=2023.01.10=h06a4308_0 11 | - cffi=1.15.1=py38h5eee18b_3 12 | - charset-normalizer=2.0.4=pyhd3eb1b0_0 13 | - cudatoolkit=11.3.1=h2bc3f7f_2 14 | - ffmpeg=4.3=hf484d3e_0 15 | - flit-core=3.6.0=pyhd3eb1b0_0 16 | - freetype=2.12.1=h4a9f257_0 17 | - giflib=5.2.1=h5eee18b_3 18 | - gmp=6.2.1=h295c915_3 19 
| - gnutls=3.6.15=he1e5248_0 20 | - jpeg=9e=h7f8727e_0 21 | - lame=3.100=h7b6447c_0 22 | - lcms2=2.12=h3be6417_0 23 | - ld_impl_linux-64=2.38=h1181459_1 24 | - lerc=3.0=h295c915_0 25 | - libdeflate=1.8=h7f8727e_5 26 | - libffi=3.4.2=h6a678d5_6 27 | - libgcc-ng=11.2.0=h1234567_1 28 | - libgomp=11.2.0=h1234567_1 29 | - libiconv=1.16=h7f8727e_2 30 | - libidn2=2.3.2=h7f8727e_0 31 | - libpng=1.6.37=hbc83047_0 32 | - libstdcxx-ng=11.2.0=h1234567_1 33 | - libtasn1=4.16.0=h27cfd23_0 34 | - libtiff=4.5.0=hecacb30_0 35 | - libunistring=0.9.10=h27cfd23_0 36 | - libwebp=1.2.4=h11a3e52_1 37 | - libwebp-base=1.2.4=h5eee18b_1 38 | - lz4-c=1.9.4=h6a678d5_0 39 | - mkl_fft=1.3.1=py38hd3c417c_0 40 | - mkl_random=1.2.2=py38h51133e4_0 41 | - ncurses=6.3=h5eee18b_3 42 | - nettle=3.7.3=hbbd107a_1 43 | - numpy-base=1.23.5=py38h31eccc5_0 44 | - openh264=2.1.1=h4ff587b_0 45 | - openssl=1.1.1t=h7f8727e_0 46 | - pip=22.3.1=py38h06a4308_0 47 | - pycparser=2.21=pyhd3eb1b0_0 48 | - pysocks=1.7.1=py38h06a4308_0 49 | - python=3.8.15=h7a1cb2a_2 50 | - pytorch-mutex=1.0=cuda 51 | - readline=8.2=h5eee18b_0 52 | - requests=2.28.1=py38h06a4308_0 53 | - setuptools=65.5.0=py38h06a4308_0 54 | - six=1.16.0=pyhd3eb1b0_1 55 | - sqlite=3.40.0=h5082296_0 56 | - tk=8.6.12=h1ccaba5_0 57 | - typing_extensions=4.4.0=py38h06a4308_0 58 | - wheel=0.37.1=pyhd3eb1b0_0 59 | - xz=5.2.8=h5eee18b_0 60 | - zlib=1.2.13=h5eee18b_0 61 | - zstd=1.5.2=ha4553b6_0 62 | - pip: 63 | - absl-py==1.3.0 64 | - accelerate==0.16.0 65 | - aiofiles==23.1.0 66 | - aiohttp==3.8.1 67 | - aiosignal==1.2.0 68 | - altair==4.2.2 69 | - antlr4-python3-runtime==4.9.3 70 | - anyio==3.6.2 71 | - appdirs==1.4.4 72 | - asttokens==2.0.5 73 | - astunparse==1.6.3 74 | - async-timeout==4.0.2 75 | - attrs==22.1.0 76 | - backcall==0.2.0 77 | - brotlipy==0.7.0 78 | - cachetools==5.2.0 79 | - certifi==2022.9.14 80 | - chex==0.1.5 81 | - click==8.0.4 82 | - commonmark==0.9.1 83 | - contourpy==1.0.5 84 | - cryptography==37.0.1 85 | - cycler==0.11.0 86 | - datasets==2.10.1 87 | - debugpy==1.5.1 88 | - decorator==5.1.1 89 | - deepspeed==0.8.3 90 | - dgl==0.9.0 91 | - dill==0.3.5.1 92 | - dm-tree==0.1.7 93 | - docker-pycreds==0.4.0 94 | - docopt==0.6.2 95 | - dpcpp-cpp-rt==2023.0.0 96 | - einops==0.6.0 97 | - entrypoints==0.4 98 | - et-xmlfile==1.1.0 99 | - executing==0.8.3 100 | - fastapi==0.95.1 101 | - ffmpy==0.3.0 102 | - filelock==3.6.0 103 | - flash-attn==1.0.3.post0 104 | - flatbuffers==22.11.23 105 | - flax==0.6.2 106 | - fonttools==4.38.0 107 | - frozenlist==1.3.1 108 | - fschat==0.2.3 109 | - fsspec==2022.8.2 110 | - gast==0.4.0 111 | - gitdb==4.0.10 112 | - gitpython==3.1.31 113 | - google-auth==2.15.0 114 | - google-auth-oauthlib==0.4.6 115 | - google-pasta==0.2.0 116 | - gradio==3.23.0 117 | - grpcio==1.51.1 118 | - h11==0.14.0 119 | - h5py==3.7.0 120 | - hjson==3.1.0 121 | - httpcore==0.17.0 122 | - httpx==0.24.0 123 | - huggingface-hub==0.14.0 124 | - hydra-core==1.2.0 125 | - idna==3.3 126 | - imageio==2.31.1 127 | - imageio-ffmpeg==0.4.8 128 | - importlib-metadata==4.11.3 129 | - importlib-resources==5.10.0 130 | - intel-cmplr-lib-rt==2023.0.0 131 | - intel-cmplr-lic-rt==2023.0.0 132 | - intel-opencl-rt==2023.0.0 133 | - intel-openmp==2023.0.0 134 | - ipykernel==6.15.2 135 | - ipython==8.4.0 136 | - jax==0.3.25 137 | - jaxlib==0.3.25 138 | - jedi==0.18.1 139 | - jinja2==3.1.2 140 | - joblib==1.1.0 141 | - jsonlines==3.1.0 142 | - jsonschema==4.17.3 143 | - jupyter-client==7.3.5 144 | - jupyter-core==4.10.0 145 | - keras==2.11.0 146 | - kiwisolver==1.4.4 147 | - 
libclang==14.0.6 148 | - linkify-it-py==2.0.0 149 | - markdown==3.4.1 150 | - markdown-it-py==2.2.0 151 | - markdown2==2.4.8 152 | - markupsafe==2.1.1 153 | - matplotlib==3.6.1 154 | - matplotlib-inline==0.1.6 155 | - mdit-py-plugins==0.3.3 156 | - mdurl==0.1.2 157 | - mkl==2023.0.0 158 | - mkl-fft==1.3.1 159 | - mkl-service==2.4.0 160 | - moviepy==1.0.3 161 | - msgpack==1.0.4 162 | - multidict==6.0.2 163 | - multiprocess==0.70.13 164 | - nest-asyncio==1.5.5 165 | - networkx==2.8.4 166 | - ninja==1.11.1 167 | - nltk==3.8.1 168 | - numpy==1.22.4 169 | - oauthlib==3.2.2 170 | - omegaconf==2.2.3 171 | - openai==0.27.2 172 | - opencv-python==4.6.0.66 173 | - openpyxl==3.1.2 174 | - opt-einsum==3.3.0 175 | - optax==0.1.3 176 | - orjson==3.8.10 177 | - packaging==21.3 178 | - pandas==1.4.4 179 | - parso==0.8.3 180 | - pathtools==0.1.2 181 | - pexpect==4.8.0 182 | - pickleshare==0.7.5 183 | - pillow==9.2.0 184 | - pipreqs==0.4.13 185 | - pkgutil-resolve-name==1.3.10 186 | - plotly==5.11.0 187 | - proglog==0.1.10 188 | - prompt-toolkit==3.0.20 189 | - protobuf==3.19.6 190 | - psutil==5.9.0 191 | - ptyprocess==0.7.0 192 | - pure-eval==0.2.2 193 | - py-cpuinfo==9.0.0 194 | - pyarrow==9.0.0 195 | - pyasn1==0.4.8 196 | - pyasn1-modules==0.2.8 197 | - pydantic==1.10.2 198 | - pydub==0.25.1 199 | - pygments==2.11.2 200 | - pyopenssl==22.0.0 201 | - pyparsing==3.0.9 202 | - pyrsistent==0.19.3 203 | - python-dateutil==2.8.2 204 | - python-multipart==0.0.6 205 | - pytz==2022.2.1 206 | - pyyaml==6.0 207 | - pyzmq==23.2.0 208 | - regex==2022.7.9 209 | - requests-oauthlib==1.3.1 210 | - responses==0.18.0 211 | - rich==12.6.0 212 | - rouge==1.0.1 213 | - rsa==4.9 214 | - sacremoses==0.0.43 215 | - scikit-learn==1.1.2 216 | - scipy==1.7.3 217 | - semantic-version==2.10.0 218 | - sentencepiece==0.1.97 219 | - sentry-sdk==1.20.0 220 | - setproctitle==1.3.2 221 | - shortuuid==1.0.11 222 | - sklearn==0.0 223 | - smmap==5.0.0 224 | - sniffio==1.3.0 225 | - stack-data==0.2.0 226 | - starlette==0.26.1 227 | - svgwrite==1.4.3 228 | - swissarmytransformer==0.2.12 229 | - tbb==2021.8.0 230 | - tenacity==8.1.0 231 | - tensorboard==2.11.0 232 | - tensorboard-data-server==0.6.1 233 | - tensorboard-plugin-wit==1.8.1 234 | - tensorboardx==2.5.1 235 | - tensorflow==2.11.0 236 | - tensorflow-estimator==2.11.0 237 | - tensorflow-io-gcs-filesystem==0.28.0 238 | - tensorstore==0.1.28 239 | - termcolor==2.1.1 240 | - threadpoolctl==3.1.0 241 | - tiktoken==0.3.2 242 | - tokenizers==0.13.3 243 | - toolz==0.12.0 244 | - torch==1.12.1+cu113 245 | - torchaudio==0.12.1+cu113 246 | - torchmetrics==0.11.3 247 | - torchvision==0.13.1+cu113 248 | - tornado==6.2 249 | - tqdm==4.64.1 250 | - traitlets==5.1.1 251 | - transformers==4.28.1 252 | - typing-extensions==4.3.0 253 | - uc-micro-py==1.0.1 254 | - urllib3==1.26.15 255 | - uvicorn==0.21.1 256 | - wandb==0.15.0 257 | - wavedrom==2.0.3.post3 258 | - websockets==11.0.2 259 | - werkzeug==2.2.2 260 | - wrapt==1.14.1 261 | - xxhash==3.0.0 262 | - yarg==0.1.9 263 | - yarl==1.8.1 264 | - zipp==3.8.0 265 | prefix: /home/haozhezhao/anaconda3/envs/nlp 266 | -------------------------------------------------------------------------------- /example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# For T5 based model\n", 10 | "from model.instructblip import InstructBlipConfig, InstructBlipModel, 
InstructBlipPreTrainedModel,InstructBlipForConditionalGeneration,InstructBlipProcessor\n", 11 | "import datasets\n", 12 | "import json\n", 13 | "import transformers\n", 14 | "from PIL import Image\n", 15 | "import torch\n", 16 | "model_type=\"instructblip\"\n", 17 | "model_ckpt=\"/home/haozhezhao/MMICL-Instructblip-T5-xxl\"\n", 18 | "processor_ckpt = \"Salesforce/instructblip-flan-t5-xxl\"\n", 19 | "config = InstructBlipConfig.from_pretrained(model_ckpt )\n", 20 | "\n", 21 | "if 'instructblip' in model_type:\n", 22 | " model = InstructBlipForConditionalGeneration.from_pretrained(\n", 23 | " model_ckpt,\n", 24 | " config=config).to('cuda:0',dtype=torch.bfloat16) \n", 25 | "\n", 26 | "image_palceholder=\"图\"\n", 27 | "sp = [image_palceholder]+[f\"\" for i in range(20)]\n", 28 | "processor = InstructBlipProcessor.from_pretrained(\n", 29 | " processor_ckpt\n", 30 | ")\n", 31 | "sp = sp+processor.tokenizer.additional_special_tokens[len(sp):]\n", 32 | "processor.tokenizer.add_special_tokens({'additional_special_tokens':sp})\n", 33 | "if model.qformer.embeddings.word_embeddings.weight.shape[0] != len(processor.qformer_tokenizer):\n", 34 | " model.qformer.resize_token_embeddings(len(processor.qformer_tokenizer))\n", 35 | "replace_token=\"\".join(32*[image_palceholder])" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 45, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | "/tmp/ipykernel_38663/1570882833.py:3: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", 48 | " image = torch.tensor(image)\n" 49 | ] 50 | }, 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "['a flamingo standing in the water', 'flamingo']\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "from torch.nn.functional import pad\n", 61 | "def padd_images( image, max_length):\n", 62 | " image = torch.tensor(image)\n", 63 | " mask = torch.zeros(max_length).bool()\n", 64 | " pad_len = max_length - image.shape[0]\n", 65 | " mask[:image.shape[0]] = True\n", 66 | " image = pad(image,(0,0,0,0,0,0,0,pad_len)) # padding behind the first dim\n", 67 | " return image,mask\n", 68 | "\n", 69 | "image = Image.open (\"images/chinchilla.png\")\n", 70 | "image1 = Image.open (\"images/shiba.png\")\n", 71 | "image2 = Image.open (\"images/flamingo.png\")\n", 72 | "\n", 73 | "image4 = Image.open (\"images/shiba.png\")\n", 74 | "image5 = Image.open (\"images/flamingo.png\")\n", 75 | "\n", 76 | "images =[ \n", 77 | "[image,image1,image2], [image4 ,image5]\n", 78 | "]\n", 79 | "\n", 80 | "prompt = [f'image 0 is {replace_token},image 1 is {replace_token},image 2 is {replace_token}. Question: is a chinchilla. They are mainly found in Chile.\\n Question: is a shiba. They are very popular in Japan.\\nQuestion: image 2 is',\n", 81 | "f'image 0 is {replace_token}, image 0 is a shiba. 
They are very popular in Japan.\\n image 1 is {replace_token}, image 1 is a',\n", 82 | "]\n", 83 | "\n", 84 | "max_image_length = max([len(f) for f in images ])\n", 85 | "\n", 86 | "\n", 87 | "inputs = processor( text=prompt, return_tensors=\"pt\",padding=True) \n", 88 | "\n", 89 | "pixel_values= [ processor(images=img, return_tensors=\"pt\")['pixel_values'] for img in images]\n", 90 | "\n", 91 | "image_list=[]\n", 92 | "mask_list= []\n", 93 | "for img in pixel_values:\n", 94 | " image,img_mask = padd_images(img,max_image_length)\n", 95 | " image_list.append(image)\n", 96 | " mask_list.append(img_mask)\n", 97 | "inputs['pixel_values'] = torch.stack(image_list).to(torch.bfloat16)\n", 98 | "inputs['img_mask'] = torch.stack(mask_list)\n", 99 | "inputs = inputs.to('cuda:1')\n", 100 | "outputs = model.generate(\n", 101 | " pixel_values = inputs['pixel_values'],\n", 102 | " input_ids = inputs['input_ids'],\n", 103 | " attention_mask = inputs['attention_mask'],\n", 104 | " img_mask = inputs['img_mask'],\n", 105 | " do_sample=False,\n", 106 | " max_length=50,\n", 107 | " min_length=1,\n", 108 | " set_min_padding_size =False,\n", 109 | ")\n", 110 | "generated_text = processor.batch_decode(outputs, skip_special_tokens=True)\n", 111 | "print(generated_text)\n", 112 | "\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 23, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 42, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 43, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stderr", 136 | "output_type": "stream", 137 | "text": [ 138 | "/tmp/ipykernel_38663/2171299194.py:2: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", 139 | " image = torch.tensor(image)\n" 140 | ] 141 | }, 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "['a flamingo standing in the water', 'flamingo']\n" 147 | ] 148 | } 149 | ], 150 | "source": [] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 32, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "torch.Size([1, 2, 3, 3, 224, 224])" 161 | ] 162 | }, 163 | "execution_count": 32, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "inputs['pixel_values'].shape" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 41, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "3x6=18\"\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "image = Image.open (\"images/cal_num1.png\")\n", 187 | "image1 = Image.open (\"images/cal_num2.png\")\n", 188 | "image2 = Image.open (\"images/cal_num3.png\")\n", 189 | "images = [image,image1,image2]\n", 190 | "\n", 191 | "prompt = [f'Use the image 0: {replace_token},image 1: {replace_token} and image 2: {replace_token} as a visual aid to help you calculate the equation accurately. 
image 0 is 2+1=3.\\nimage 1 is 5+6=11.\\nimage 2 is\"']\n", 192 | "prompt = \" \".join(prompt)\n", 193 | "\n", 194 | "inputs = processor(images=images, text=prompt, return_tensors=\"pt\")\n", 195 | "\n", 196 | "inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)\n", 197 | "inputs['img_mask'] = torch.tensor([[1 for i in range(len(images))]])\n", 198 | "inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)\n", 199 | "\n", 200 | "inputs = inputs.to('cuda:0')\n", 201 | "outputs = model.generate(\n", 202 | " pixel_values = inputs['pixel_values'],\n", 203 | " input_ids = inputs['input_ids'],\n", 204 | " attention_mask = inputs['attention_mask'],\n", 205 | " img_mask = inputs['img_mask'],\n", 206 | " do_sample=False,\n", 207 | " max_length=50,\n", 208 | " min_length=1,\n", 209 | " set_min_padding_size =False,\n", 210 | ")\n", 211 | "generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()\n", 212 | "print(generated_text)\n" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 42, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "a flamingo. They are native to South America and are known for their bright red plumage and distinctive call.\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "\n", 230 | "image = Image.open (\"images/chinchilla.png\")\n", 231 | "image1 = Image.open (\"images/shiba.png\")\n", 232 | "image2 = Image.open (\"images/flamingo.png\")\n", 233 | "images = [image,image1,image2]\n", 234 | "images = [image,image1,image2]\n", 235 | "prompt = [f'image 0 is {replace_token},image 1 is {replace_token},image 2 is {replace_token}. Question: is a chinchilla. They are mainly found in Chile.\\n Question: is a shiba. 
They are very popular in Japan.\\nQuestion: image 2 is']\n", 236 | "\n", 237 | "prompt = \" \".join(prompt)\n", 238 | "\n", 239 | "inputs = processor(images=images, text=prompt, return_tensors=\"pt\")\n", 240 | "\n", 241 | "inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)\n", 242 | "inputs['img_mask'] = torch.tensor([[1 for i in range(len(images))]])\n", 243 | "inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)\n", 244 | "\n", 245 | "inputs = inputs.to('cuda:0')\n", 246 | "outputs = model.generate(\n", 247 | " pixel_values = inputs['pixel_values'],\n", 248 | " input_ids = inputs['input_ids'],\n", 249 | " attention_mask = inputs['attention_mask'],\n", 250 | " img_mask = inputs['img_mask'],\n", 251 | " do_sample=False,\n", 252 | " max_length=50,\n", 253 | " min_length=1,\n", 254 | " set_min_padding_size =False,\n", 255 | ")\n", 256 | "generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()\n", 257 | "print(generated_text)\n" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 119, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "name": "stdout", 267 | "output_type": "stream", 268 | "text": [ 269 | "image 0 is a photo of a flamingo standing in the water, image 1 is a cartoon drawing of a flamingo and image 2 is a low polygon count 3d model animation render of\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "image = Image.open (\"images/flamingo_photo.png\")\n", 275 | "image1 = Image.open (\"images/flamingo_cartoon.png\")\n", 276 | "image2 = Image.open (\"images/flamingo_3d.png\")\n", 277 | "\n", 278 | "images = [image,image1,image2]\n", 279 | "prompt = [f'Use the image 0: {replace_token}, image 1: {replace_token} and image 2: {replace_token} as a visual aids to help you answer the question. Question: Give the reason why image 0, image 1 and image 2 are different? 
Answer:']\n", 280 | "\n", 281 | "prompt = \" \".join(prompt)\n", 282 | "\n", 283 | "inputs = processor(images=images, text=prompt, return_tensors=\"pt\")\n", 284 | "\n", 285 | "inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)\n", 286 | "inputs['img_mask'] = torch.tensor([[1 for i in range(len(images))]])\n", 287 | "inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)\n", 288 | "\n", 289 | "inputs = inputs.to('cuda:0')\n", 290 | "outputs = model.generate(\n", 291 | " pixel_values = inputs['pixel_values'],\n", 292 | " input_ids = inputs['input_ids'],\n", 293 | " attention_mask = inputs['attention_mask'],\n", 294 | " img_mask = inputs['img_mask'],\n", 295 | " do_sample=False,\n", 296 | " max_length=80,\n", 297 | " min_length=50,\n", 298 | " num_beams=8,\n", 299 | " set_min_padding_size =False,\n", 300 | ")\n", 301 | "generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()\n", 302 | "print(generated_text)\n" 303 | ] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "nlp", 309 | "language": "python", 310 | "name": "python3" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 3 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython3", 322 | "version": "3.8.17" 323 | }, 324 | "orig_nbformat": 4 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 2 328 | } 329 | -------------------------------------------------------------------------------- /images/cal_num1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/cal_num1.png -------------------------------------------------------------------------------- /images/cal_num2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/cal_num2.png -------------------------------------------------------------------------------- /images/cal_num3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/cal_num3.png -------------------------------------------------------------------------------- /images/chinchilla.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/chinchilla.png -------------------------------------------------------------------------------- /images/dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/dataset.png -------------------------------------------------------------------------------- /images/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/example.png -------------------------------------------------------------------------------- /images/example2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/example2.png -------------------------------------------------------------------------------- /images/example3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/example3.png -------------------------------------------------------------------------------- /images/flamingo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/flamingo.png -------------------------------------------------------------------------------- /images/flamingo_3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/flamingo_3d.png -------------------------------------------------------------------------------- /images/flamingo_cartoon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/flamingo_cartoon.png -------------------------------------------------------------------------------- /images/flamingo_photo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/flamingo_photo.png -------------------------------------------------------------------------------- /images/model_struct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/model_struct.png -------------------------------------------------------------------------------- /images/new_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/new_example.png -------------------------------------------------------------------------------- /images/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/overview.png -------------------------------------------------------------------------------- /images/shiba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/images/shiba.png -------------------------------------------------------------------------------- /model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/model/.DS_Store -------------------------------------------------------------------------------- /model/blip2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/model/blip2/.DS_Store -------------------------------------------------------------------------------- /model/blip2/__init__.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available 17 | 18 | 19 | _import_structure = { 20 | "configuration_blip_2": [ 21 | "BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP", 22 | "Blip2Config", 23 | "Blip2QFormerConfig", 24 | "Blip2VisionConfig", 25 | ], 26 | "processing_blip_2": ["Blip2Processor"], 27 | } 28 | 29 | try: 30 | if not is_torch_available(): 31 | raise OptionalDependencyNotAvailable() 32 | except OptionalDependencyNotAvailable: 33 | pass 34 | else: 35 | _import_structure["modeling_blip_2"] = [ 36 | "BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST", 37 | "Blip2QFormerModel", 38 | "Blip2PreTrainedModel", 39 | "Blip2ForConditionalGeneration", 40 | "Blip2VisionModel", 41 | ] 42 | 43 | if TYPE_CHECKING: 44 | from .configuration_blip_2 import ( 45 | BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP, 46 | Blip2Config, 47 | Blip2QFormerConfig, 48 | Blip2VisionConfig, 49 | ) 50 | from .processing_blip_2 import Blip2Processor 51 | 52 | try: 53 | if not is_torch_available(): 54 | raise OptionalDependencyNotAvailable() 55 | except OptionalDependencyNotAvailable: 56 | pass 57 | else: 58 | from .modeling_blip_2 import ( 59 | BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST, 60 | Blip2ForConditionalGeneration, 61 | Blip2PreTrainedModel, 62 | Blip2QFormerModel, 63 | Blip2VisionModel, 64 | ) 65 | 66 | else: 67 | import sys 68 | 69 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 70 | -------------------------------------------------------------------------------- /model/blip2/configuration_blip_2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: JustBluce 972281745@qq.com 3 | Date: 2023-02-15 09:27:06 4 | LastEditors: JustBluce 972281745@qq.com 5 | LastEditTime: 2023-02-15 09:31:34 6 | FilePath: /VisionLanguagePromptSource/deepspeed-transformers/model/configuration_blip_2.py 7 | Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE 8 | ''' 9 | # coding=utf-8 10 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved. 11 | # 12 | # Licensed under the Apache License, Version 2.0 (the "License"); 13 | # you may not use this file except in compliance with the License. 14 | # You may obtain a copy of the License at 15 | # 16 | # http://www.apache.org/licenses/LICENSE-2.0 17 | # 18 | # Unless required by applicable law or agreed to in writing, software 19 | # distributed under the License is distributed on an "AS IS" BASIS, 20 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 21 | # See the License for the specific language governing permissions and 22 | # limitations under the License. 
23 | """ BLIP-2 model configuration""" 24 | 25 | import copy 26 | import os 27 | from typing import Union 28 | 29 | from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES 30 | 31 | from transformers.configuration_utils import PretrainedConfig 32 | from transformers.utils import logging 33 | from transformers.models.auto import CONFIG_MAPPING 34 | 35 | 36 | logger = logging.get_logger(__name__) 37 | 38 | BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP = { 39 | "salesforce/blip2-opt-2.7b": "https://huggingface.co/salesforce/blip2-opt-2.7b/resolve/main/config.json", 40 | } 41 | 42 | 43 | class Blip2VisionConfig(PretrainedConfig): 44 | r""" 45 | This is the configuration class to store the configuration of a [`Blip2VisionModel`]. It is used to instantiate a 46 | BLIP-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a 47 | configuration defaults will yield a similar configuration to that of the BLIP-2 48 | [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. 49 | 50 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 51 | documentation from [`PretrainedConfig`] for more information. 52 | 53 | Args: 54 | hidden_size (`int`, *optional*, defaults to 1408): 55 | Dimensionality of the encoder layers and the pooler layer. 56 | intermediate_size (`int`, *optional*, defaults to 6144): 57 | Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 58 | num_hidden_layers (`int`, *optional*, defaults to 39): 59 | Number of hidden layers in the Transformer encoder. 60 | num_attention_heads (`int`, *optional*, defaults to 16): 61 | Number of attention heads for each attention layer in the Transformer encoder. 62 | image_size (`int`, *optional*, defaults to 224): 63 | The size (resolution) of each image. 64 | patch_size (`int`, *optional*, defaults to 14): 65 | The size (resolution) of each patch. 66 | hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): 67 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, 68 | `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults 69 | to 1e-5): The epsilon used by the layer normalization layers. 70 | dropout (`float`, *optional*, defaults to 0.0): 71 | The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. 72 | attention_dropout (`float`, *optional*, defaults to 0.0): 73 | The dropout ratio for the attention probabilities. 74 | initializer_range (`float`, *optional*, defaults to 0.02): 75 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 76 | initializer_factor (`float``, *optional*, defaults to 1): 77 | A factor for initializing all weight matrices (should be kept to 1, used internally for initialization 78 | testing). 79 | qkv_bias (`bool`, *optional*, defaults to `True`): 80 | Whether to add a bias to the queries and values in the self-attention layers. 
81 | 82 | Example: 83 | 84 | ```python 85 | >>> from transformers import Blip2VisionConfig, Blip2VisionModel 86 | 87 | >>> # Initializing a Blip2VisionConfig with Salesforce/blip2-opt-2.7b style configuration 88 | >>> configuration = Blip2VisionConfig() 89 | 90 | >>> # Initializing a Blip2VisionModel (with random weights) from the Salesforce/blip2-opt-2.7b style configuration 91 | >>> model = Blip2VisionModel(configuration) 92 | 93 | >>> # Accessing the model configuration 94 | >>> configuration = model.config 95 | ```""" 96 | 97 | model_type = "blip_2_vision_model" 98 | 99 | def __init__( 100 | self, 101 | hidden_size=1408, 102 | intermediate_size=6144, 103 | projection_dim=512, 104 | num_hidden_layers=39, 105 | num_attention_heads=16, 106 | num_channels=3, 107 | image_size=224, 108 | patch_size=14, 109 | hidden_act="gelu", 110 | layer_norm_eps=0.00001, 111 | dropout=0.0, 112 | attention_dropout=0.0, 113 | initializer_range=1e-10, 114 | initializer_factor=1.0, 115 | qkv_bias=True, 116 | **kwargs, 117 | ): 118 | super().__init__(**kwargs) 119 | 120 | self.hidden_size = hidden_size 121 | self.intermediate_size = intermediate_size 122 | self.projection_dim = projection_dim 123 | self.dropout = dropout 124 | self.num_hidden_layers = num_hidden_layers 125 | self.num_attention_heads = num_attention_heads 126 | self.num_channels = num_channels 127 | self.patch_size = patch_size 128 | self.image_size = image_size 129 | self.initializer_range = initializer_range 130 | self.initializer_factor = initializer_factor 131 | self.attention_dropout = attention_dropout 132 | self.layer_norm_eps = layer_norm_eps 133 | self.hidden_act = hidden_act 134 | self.qkv_bias = qkv_bias 135 | 136 | @classmethod 137 | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": 138 | config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) 139 | 140 | # get the vision config dict if we are loading from Blip2Config 141 | if config_dict.get("model_type") == "blip-2": 142 | config_dict = config_dict["vision_config"] 143 | 144 | if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: 145 | logger.warning( 146 | f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " 147 | f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 148 | ) 149 | 150 | return cls.from_dict(config_dict, **kwargs) 151 | 152 | 153 | class Blip2QFormerConfig(PretrainedConfig): 154 | r""" 155 | This is the configuration class to store the configuration of a [`Blip2QFormerModel`]. It is used to instantiate a 156 | BLIP-2 Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. 157 | Instantiating a configuration with the defaults will yield a similar configuration to that of the BLIP-2 158 | [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. Configuration objects 159 | inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from 160 | [`PretrainedConfig`] for more information. 161 | 162 | Note that [`Blip2QFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. 163 | 164 | Args: 165 | vocab_size (`int`, *optional*, defaults to 30522): 166 | Vocabulary size of the Q-Former model. 
Defines the number of different tokens that can be represented by 167 | the `inputs_ids` passed when calling the model. 168 | hidden_size (`int`, *optional*, defaults to 768): 169 | Dimensionality of the encoder layers and the pooler layer. 170 | num_hidden_layers (`int`, *optional*, defaults to 12): 171 | Number of hidden layers in the Transformer encoder. 172 | num_attention_heads (`int`, *optional*, defaults to 12): 173 | Number of attention heads for each attention layer in the Transformer encoder. 174 | intermediate_size (`int`, *optional*, defaults to 3072): 175 | Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. 176 | hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): 177 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, 178 | `"relu"`, `"silu"` and `"gelu_new"` are supported. 179 | hidden_dropout_prob (`float`, *optional*, defaults to 0.1): 180 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 181 | attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): 182 | The dropout ratio for the attention probabilities. 183 | max_position_embeddings (`int`, *optional*, defaults to 512): 184 | The maximum sequence length that this model might ever be used with. Typically set this to something large 185 | just in case (e.g., 512 or 1024 or 2048). 186 | initializer_range (`float`, *optional*, defaults to 0.02): 187 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 188 | layer_norm_eps (`float`, *optional*, defaults to 1e-12): 189 | The epsilon used by the layer normalization layers. 190 | position_embedding_type (`str`, *optional*, defaults to `"absolute"`): 191 | Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For 192 | positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to 193 | [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). 194 | For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models 195 | with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). 196 | classifier_dropout (`float`, *optional*): 197 | The dropout ratio for the classification head. 198 | cross_attention_frequency (`int`, *optional*, defaults to 2): 199 | The frequency of adding cross-attention to the Transformer layers. 200 | encoder_hidden_size (`int`, *optional*, defaults to 1408): 201 | The hidden size of the hidden states for cross-attention. 
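To make the last two arguments above concrete: in the BLIP-2 modeling code, a Q-Former layer conventionally carries a cross-attention block when its layer index is a multiple of `cross_attention_frequency`, and that block attends over vision features of width `encoder_hidden_size`. A small sketch under that assumption (the layer-selection rule is the usual convention, not something defined in this configuration file):

```python
from model.blip2 import Blip2QFormerConfig

cfg = Blip2QFormerConfig()  # 12 layers, cross_attention_frequency=2, encoder_hidden_size=1408
cross_attn_layers = [i for i in range(cfg.num_hidden_layers)
                     if i % cfg.cross_attention_frequency == 0]
print(cross_attn_layers)  # [0, 2, 4, 6, 8, 10]: every other layer reads the 1408-d ViT features
```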
202 | 203 | Examples: 204 | 205 | ```python 206 | >>> from transformers import Blip2QFormerConfig, Blip2QFormerModel 207 | 208 | >>> # Initializing a BLIP-2 Salesforce/blip2-opt-2.7b style configuration 209 | >>> configuration = Blip2QFormerConfig() 210 | 211 | >>> # Initializing a model (with random weights) from the Salesforce/blip2-opt-2.7b style configuration 212 | >>> model = Blip2QFormerModel(configuration) 213 | >>> # Accessing the model configuration 214 | >>> configuration = model.config 215 | ```""" 216 | model_type = "blip_2_qformer" 217 | 218 | def __init__( 219 | self, 220 | vocab_size=30522, 221 | hidden_size=768, 222 | num_hidden_layers=12, 223 | num_attention_heads=12, 224 | intermediate_size=3072, 225 | hidden_act="gelu", 226 | hidden_dropout_prob=0.1, 227 | attention_probs_dropout_prob=0.1, 228 | max_position_embeddings=512, 229 | initializer_range=0.02, 230 | layer_norm_eps=1e-12, 231 | pad_token_id=0, 232 | position_embedding_type="absolute", 233 | classifier_dropout=None, 234 | cross_attention_frequency=2, 235 | encoder_hidden_size=1408, 236 | **kwargs, 237 | ): 238 | super().__init__(pad_token_id=pad_token_id, **kwargs) 239 | 240 | self.vocab_size = vocab_size 241 | self.hidden_size = hidden_size 242 | self.num_hidden_layers = num_hidden_layers 243 | self.num_attention_heads = num_attention_heads 244 | self.hidden_act = hidden_act 245 | self.intermediate_size = intermediate_size 246 | self.hidden_dropout_prob = hidden_dropout_prob 247 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 248 | self.max_position_embeddings = max_position_embeddings 249 | self.initializer_range = initializer_range 250 | self.layer_norm_eps = layer_norm_eps 251 | self.position_embedding_type = position_embedding_type 252 | self.classifier_dropout = classifier_dropout 253 | self.cross_attention_frequency = cross_attention_frequency 254 | self.encoder_hidden_size = encoder_hidden_size 255 | 256 | @classmethod 257 | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": 258 | config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) 259 | 260 | # get the qformer config dict if we are loading from Blip2Config 261 | if config_dict.get("model_type") == "blip-2": 262 | config_dict = config_dict["qformer_config"] 263 | 264 | if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: 265 | logger.warning( 266 | f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " 267 | f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 268 | ) 269 | 270 | return cls.from_dict(config_dict, **kwargs) 271 | 272 | 273 | class Blip2Config(PretrainedConfig): 274 | r""" 275 | [`Blip2Config`] is the configuration class to store the configuration of a [`Blip2ForConditionalGeneration`]. It is 276 | used to instantiate a BLIP-2 model according to the specified arguments, defining the vision model, Q-Former model 277 | and language model configs. Instantiating a configuration with the defaults will yield a similar configuration to 278 | that of the BLIP-2 [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. 279 | 280 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 281 | documentation from [`PretrainedConfig`] for more information. 
282 | 283 | Args: 284 | vision_config (`dict`, *optional*): 285 | Dictionary of configuration options used to initialize [`Blip2VisionConfig`]. 286 | qformer_config (`dict`, *optional*): 287 | Dictionary of configuration options used to initialize [`Blip2QFormerConfig`]. 288 | text_config (`dict`, *optional*): 289 | Dictionary of configuration options used to initialize any [`PretrainedConfig`]. 290 | num_query_tokens (`int`, *optional*, defaults to 32): 291 | The number of query tokens passed through the Transformer. 292 | 293 | kwargs (*optional*): 294 | Dictionary of keyword arguments. 295 | 296 | Example: 297 | 298 | ```python 299 | >>> from transformers import ( 300 | ... Blip2VisionConfig, 301 | ... Blip2QFormerConfig, 302 | ... OPTConfig, 303 | ... Blip2Config, 304 | ... Blip2ForConditionalGeneration, 305 | ... ) 306 | 307 | >>> # Initializing a Blip2Config with Salesforce/blip2-opt-2.7b style configuration 308 | >>> configuration = Blip2Config() 309 | 310 | >>> # Initializing a Blip2ForConditionalGeneration (with random weights) from the Salesforce/blip2-opt-2.7b style configuration 311 | >>> model = Blip2ForConditionalGeneration(configuration) 312 | 313 | >>> # Accessing the model configuration 314 | >>> configuration = model.config 315 | 316 | >>> # We can also initialize a Blip2Config from a Blip2VisionConfig, Blip2QFormerConfig and any PretrainedConfig 317 | 318 | >>> # Initializing BLIP-2 vision, BLIP-2 Q-Former and language model configurations 319 | >>> vision_config = Blip2VisionConfig() 320 | >>> qformer_config = Blip2QFormerConfig() 321 | >>> text_config = OPTConfig() 322 | 323 | >>> config = Blip2Config.from_text_vision_configs(vision_config, qformer_config, text_config) 324 | ```""" 325 | 326 | model_type = "blip-2" 327 | is_composition = True 328 | 329 | def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs): 330 | super().__init__(**kwargs) 331 | 332 | if vision_config is None: 333 | vision_config = {} 334 | logger.info("vision_config is None. initializing the Blip2VisionConfig with default values.") 335 | 336 | if qformer_config is None: 337 | qformer_config = {} 338 | logger.info("qformer_config is None. Initializing the Blip2QFormerConfig with default values.") 339 | 340 | if text_config is None: 341 | text_config = {} 342 | logger.info("text_config is None. 
Initializing the text config with default values (`OPTConfig`).") 343 | 344 | self.vision_config = Blip2VisionConfig(**vision_config) 345 | self.qformer_config = Blip2QFormerConfig(**qformer_config) 346 | text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" 347 | self.text_config = CONFIG_MAPPING[text_model_type](**text_config) 348 | 349 | self.tie_word_embeddings = self.text_config.tie_word_embeddings 350 | self.is_encoder_decoder = self.text_config.is_encoder_decoder 351 | 352 | self.num_query_tokens = num_query_tokens 353 | self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size 354 | self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES 355 | self.initializer_factor = 1.0 356 | self.initializer_range = 0.02 357 | 358 | @classmethod 359 | def from_vision_qformer_text_configs( 360 | cls, 361 | vision_config: Blip2VisionConfig, 362 | qformer_config: Blip2QFormerConfig, 363 | text_config: PretrainedConfig, 364 | **kwargs, 365 | ): 366 | r""" 367 | Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model 368 | configurations. 369 | 370 | Returns: 371 | [`Blip2Config`]: An instance of a configuration object 372 | """ 373 | 374 | return cls( 375 | vision_config=vision_config.to_dict(), 376 | qformer_config=qformer_config.to_dict(), 377 | text_config=text_config.to_dict(), 378 | **kwargs, 379 | ) 380 | 381 | def to_dict(self): 382 | """ 383 | Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 384 | 385 | Returns: 386 | `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, 387 | """ 388 | output = copy.deepcopy(self.__dict__) 389 | output["vision_config"] = self.vision_config.to_dict() 390 | output["qformer_config"] = self.qformer_config.to_dict() 391 | output["text_config"] = self.text_config.to_dict() 392 | output["model_type"] = self.__class__.model_type 393 | return output 394 | -------------------------------------------------------------------------------- /model/blip2/convert_blip_2_original_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Convert BLIP-2 checkpoints from the original repository. 
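Before moving on to the checkpoint-conversion script, here is a small sketch of the composition logic in `Blip2Config.__init__` above, using a plain `T5Config` as a stand-in for the flan-t5 backbones targeted by this repository's run scripts (default values only, not the real flan-t5-xl hyper-parameters):

```python
from transformers import T5Config
from model.blip2 import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig

config = Blip2Config.from_vision_qformer_text_configs(
    Blip2VisionConfig(), Blip2QFormerConfig(), T5Config()
)
# The Q-Former's cross-attention width is tied to the vision encoder's hidden size...
assert config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size  # 1408
# ...and T5 is an encoder-decoder, so the decoder-only language-model path is disabled.
assert config.is_encoder_decoder and not config.use_decoder_only_language_model
```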
17 | 18 | URL: https://github.com/salesforce/LAVIS/tree/main/projects/blip2 19 | """ 20 | 21 | import argparse 22 | 23 | import requests 24 | import torch 25 | 26 | # pip3 install salesforce-lavis 27 | # I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis 28 | from lavis.models import load_model_and_preprocess 29 | from PIL import Image 30 | 31 | from transformers import ( 32 | AutoTokenizer, 33 | Blip2Config, 34 | Blip2ForConditionalGeneration, 35 | Blip2Processor, 36 | Blip2VisionConfig, 37 | BlipImageProcessor, 38 | OPTConfig, 39 | T5Config, 40 | ) 41 | from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD 42 | 43 | 44 | def load_demo_image(): 45 | url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png" 46 | image = Image.open(requests.get(url, stream=True).raw).convert("RGB") 47 | 48 | return image 49 | 50 | 51 | # here we list all keys to be renamed (original name on the left, our name on the right) 52 | def create_rename_keys(config): 53 | rename_keys = [] 54 | # fmt: off 55 | 56 | # vision encoder 57 | rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) 58 | rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) 59 | rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) 60 | rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) 61 | rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) 62 | rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) 63 | 64 | for i in range(config.vision_config.num_hidden_layers): 65 | rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) 66 | rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) 67 | rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) 68 | rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) 69 | rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) 70 | rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) 71 | rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) 72 | rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) 73 | rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) 74 | rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) 75 | rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) 76 | 77 | # QFormer 78 | rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.layernorm.weight")) 79 | rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.layernorm.bias")) 80 | 81 | # fmt: on 82 | return rename_keys 83 | 84 | 85 | def rename_key(dct, old, new): 86 | val = dct.pop(old) 87 | dct[new] = val 88 | 89 | 90 | def 
read_in_q_v_bias(state_dict, config): 91 | for i in range(config.vision_config.num_hidden_layers): 92 | # read in original q and v biases 93 | q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") 94 | v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") 95 | 96 | # next, set bias in the state dict 97 | qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) 98 | state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias 99 | 100 | 101 | def get_blip2_config(model_name, eos_token_id): 102 | image_size = 364 if "coco" in model_name else 224 103 | vision_config = Blip2VisionConfig(image_size=image_size).to_dict() 104 | 105 | # make sure the models have proper bos_token_id and eos_token_id set (important for generation) 106 | # seems like flan-T5 models don't have bos_token_id properly set? 107 | if "opt-2.7b" in model_name: 108 | text_config = OPTConfig.from_pretrained("facebook/opt-2.7b", eos_token_id=eos_token_id).to_dict() 109 | elif "opt-6.7b" in model_name: 110 | text_config = OPTConfig.from_pretrained("facebook/opt-6.7b", eos_token_id=eos_token_id).to_dict() 111 | elif "t5-xl" in model_name: 112 | text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() 113 | elif "t5-xxl" in model_name: 114 | text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() 115 | 116 | config = Blip2Config(vision_config=vision_config, text_config=text_config) 117 | 118 | return config, image_size 119 | 120 | 121 | @torch.no_grad() 122 | def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): 123 | """ 124 | Copy/paste/tweak model's weights to Transformers design. 
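The one non-obvious step above is `read_in_q_v_bias`: the original LAVIS vision encoder stores separate `q_bias` and `v_bias` tensors and uses no key bias, while the ported model expects a single fused `qkv.bias`, so the conversion concatenates `[q_bias, zeros, v_bias]`. A tiny numeric illustration:

```python
import torch

q_bias = torch.tensor([1.0, 2.0])
v_bias = torch.tensor([3.0, 4.0])
# Same packing as read_in_q_v_bias: a zero key bias sits between the q and v parts.
qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias), v_bias))
print(qkv_bias)  # tensor([1., 2., 0., 0., 3., 4.])
```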
125 | """ 126 | tokenizer = ( 127 | AutoTokenizer.from_pretrained("facebook/opt-2.7b") 128 | if "opt" in model_name 129 | else AutoTokenizer.from_pretrained("google/flan-t5-xl") 130 | ) 131 | eos_token_id = tokenizer("\n", add_special_tokens=False).input_ids[0] 132 | config, image_size = get_blip2_config(model_name, eos_token_id=eos_token_id) 133 | 134 | hf_model = Blip2ForConditionalGeneration(config).eval() 135 | 136 | model_name_to_original = { 137 | "blip2-opt-2.7b": ("blip2_opt", "pretrain_opt2.7b"), 138 | "blip2-opt-6.7b": ("blip2_opt", "pretrain_opt6.7b"), 139 | "blip2-opt-2.7b-coco": ("blip2_opt", "caption_coco_opt2.7b"), 140 | "blip2-opt-6.7b-coco": ("blip2_opt", "caption_coco_opt6.7b"), 141 | "blip2-flan-t5-xl": ("blip2_t5", "pretrain_flant5xl"), 142 | "blip2-flan-t5-xl-coco": ("blip2_t5", "caption_coco_flant5xl"), 143 | "blip2-flan-t5-xxl": ("blip2_t5", "pretrain_flant5xxl"), 144 | } 145 | 146 | name, type = model_name_to_original[model_name] 147 | 148 | # load original model 149 | print("Loading original model...") 150 | device = "cuda" if torch.cuda.is_available() else "cpu" 151 | original_model, vis_processors, _ = load_model_and_preprocess( 152 | name=name, model_type=type, is_eval=True, device=device 153 | ) 154 | original_model.eval() 155 | print("Done!") 156 | 157 | # update state dict keys 158 | state_dict = original_model.state_dict() 159 | rename_keys = create_rename_keys(config) 160 | for src, dest in rename_keys: 161 | rename_key(state_dict, src, dest) 162 | 163 | # some keys can be renamed efficiently 164 | for key, val in state_dict.copy().items(): 165 | val = state_dict.pop(key) 166 | if key.startswith("Qformer.bert"): 167 | key = key.replace("Qformer.bert", "qformer") 168 | if "attention.self" in key: 169 | key = key.replace("self", "attention") 170 | if "opt_proj" in key: 171 | key = key.replace("opt_proj", "language_projection") 172 | if "t5_proj" in key: 173 | key = key.replace("t5_proj", "language_projection") 174 | if key.startswith("opt"): 175 | key = key.replace("opt", "language") 176 | if key.startswith("t5"): 177 | key = key.replace("t5", "language") 178 | state_dict[key] = val 179 | 180 | # read in qv biases 181 | read_in_q_v_bias(state_dict, config) 182 | 183 | missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) 184 | assert len(missing_keys) == 0 185 | assert unexpected_keys == ["qformer.embeddings.position_ids"] 186 | 187 | image = load_demo_image() 188 | original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(device) 189 | input_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to(device) 190 | 191 | # create processor 192 | image_processor = BlipImageProcessor( 193 | size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD 194 | ) 195 | processor = Blip2Processor(image_processor=image_processor, tokenizer=tokenizer) 196 | pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device) 197 | 198 | # make sure processor creates exact same pixel values 199 | assert torch.allclose(pixel_values, original_pixel_values) 200 | 201 | original_model.to(device) 202 | hf_model.to(device) 203 | with torch.no_grad(): 204 | if "opt" in model_name: 205 | original_logits = original_model({"image": original_pixel_values, "text_input": [""]}).logits 206 | logits = hf_model(original_pixel_values, input_ids).logits 207 | else: 208 | original_logits = original_model( 209 | {"image": original_pixel_values, "text_input": ["\n"], "text_output": ["\n"]} 210 
| ).logits 211 | labels = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100) 212 | logits = hf_model(original_pixel_values, input_ids, labels=labels).logits 213 | 214 | assert original_logits.shape == logits.shape 215 | print("First values of original logits:", original_logits[0, :3, :3]) 216 | print("First values of HF logits:", logits[0, :3, :3]) 217 | 218 | # assert values 219 | if model_name == "blip2-flan-t5-xl": 220 | expected_slice_logits = torch.tensor( 221 | [[-41.5850, -4.4440, -8.9922], [-47.4322, -5.9143, -1.7340]], device=device 222 | ) 223 | assert torch.allclose(logits[0, :3, :3], expected_slice_logits, atol=1e-4) 224 | elif model_name == "blip2-flan-t5-xl-coco": 225 | expected_slice_logits = torch.tensor( 226 | [[-57.0109, -9.8967, -12.6280], [-68.6578, -12.7191, -10.5065]], device=device 227 | ) 228 | else: 229 | # cast to same type 230 | target_dtype = logits.dtype 231 | assert torch.allclose(original_logits.to(target_dtype), logits, atol=1e-2) 232 | print("Looks ok!") 233 | 234 | print("Generating a caption...") 235 | prompt = "" 236 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) 237 | 238 | original_outputs = original_model.generate({"image": original_pixel_values}) 239 | outputs = hf_model.generate( 240 | original_pixel_values, 241 | input_ids, 242 | do_sample=False, 243 | num_beams=5, 244 | max_length=30, 245 | min_length=1, 246 | top_p=0.9, 247 | repetition_penalty=1.0, 248 | length_penalty=1.0, 249 | temperature=1, 250 | ) 251 | print("Original generation:", original_outputs) 252 | prompt_length = input_ids.shape[1] 253 | output_text = processor.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True) 254 | output_text = [text.strip() for text in output_text] 255 | print("HF generation:", output_text) 256 | 257 | if pytorch_dump_folder_path is not None: 258 | processor.save_pretrained(pytorch_dump_folder_path) 259 | hf_model.save_pretrained(pytorch_dump_folder_path) 260 | 261 | if push_to_hub: 262 | processor.push_to_hub(f"nielsr/{model_name}") 263 | hf_model.push_to_hub(f"nielsr/{model_name}") 264 | 265 | 266 | if __name__ == "__main__": 267 | parser = argparse.ArgumentParser() 268 | choices = [ 269 | "blip2-opt-2.7b", 270 | "blip2-opt-6.7b", 271 | "blip2-opt-2.7b-coco", 272 | "blip2-opt-6.7b-coco", 273 | "blip2-flan-t5-xl", 274 | "blip2-flan-t5-xl-coco", 275 | "blip2-flan-t5-xxl", 276 | ] 277 | parser.add_argument( 278 | "--model_name", 279 | default="blip2-opt-2.7b", 280 | choices=choices, 281 | type=str, 282 | help="Path to hf config.json of model to convert", 283 | ) 284 | parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") 285 | parser.add_argument( 286 | "--push_to_hub", 287 | action="store_true", 288 | help="Whether to push the model and processor to the hub after converting", 289 | ) 290 | 291 | args = parser.parse_args() 292 | 293 | convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) 294 | -------------------------------------------------------------------------------- /model/blip2/processing_blip_2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
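Given the argument parser above, the conversion can be driven from the command line or called directly in Python. A hedged sketch (the output directory is only an example, and importing the script requires `salesforce-lavis`, as noted in its own comments):

```python
# Command-line form (flags exactly as defined in the argparse block above):
#   python model/blip2/convert_blip_2_original_to_pytorch.py \
#       --model_name blip2-flan-t5-xl --pytorch_dump_folder_path ./blip2-flan-t5-xl
from model.blip2.convert_blip_2_original_to_pytorch import convert_blip2_checkpoint

convert_blip2_checkpoint(
    model_name="blip2-flan-t5-xl",
    pytorch_dump_folder_path="./blip2-flan-t5-xl",  # example path for the saved HF model
    push_to_hub=False,
)
```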
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Processor class for BLIP-2. 17 | """ 18 | 19 | from typing import List, Optional, Union 20 | 21 | from transformers.processing_utils import ProcessorMixin 22 | from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy 23 | from transformers.utils import TensorType 24 | 25 | 26 | class Blip2Processor(ProcessorMixin): 27 | r""" 28 | Constructs a BLIP-2 processor which wraps a BLIP image processor and an OPT/T5 tokenizer into a single processor. 29 | 30 | [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the docstring 31 | of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. 32 | 33 | Args: 34 | image_processor (`BlipImageProcessor`): 35 | An instance of [`BlipImageProcessor`]. The image processor is a required input. 36 | tokenizer (`AutoTokenizer`): 37 | An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. 38 | """ 39 | attributes = ["image_processor", "tokenizer"] 40 | image_processor_class = "BlipImageProcessor" 41 | tokenizer_class = "AutoTokenizer" 42 | 43 | # Copied from transformers.models.blip.processing_blip.BlipProcessor.__init__ 44 | def __init__(self, image_processor, tokenizer): 45 | tokenizer.return_token_type_ids = False 46 | super().__init__(image_processor, tokenizer) 47 | self.current_processor = self.image_processor 48 | 49 | # Copied from transformers.models.blip.processing_blip.BlipProcessor.__call__ 50 | def __call__( 51 | self, 52 | images=None, 53 | text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, 54 | add_special_tokens: bool = True, 55 | padding: Union[bool, str, PaddingStrategy] = False, 56 | truncation: Union[bool, str, TruncationStrategy] = None, 57 | max_length: Optional[int] = None, 58 | stride: int = 0, 59 | pad_to_multiple_of: Optional[int] = None, 60 | return_attention_mask: Optional[bool] = None, 61 | return_overflowing_tokens: bool = False, 62 | return_special_tokens_mask: bool = False, 63 | return_offsets_mapping: bool = False, 64 | return_token_type_ids: bool = False, 65 | return_length: bool = False, 66 | verbose: bool = True, 67 | return_tensors: Optional[Union[str, TensorType]] = None, 68 | **kwargs, 69 | ) -> BatchEncoding: 70 | """ 71 | This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and 72 | [`BertTokenizerFast.__call__`] to prepare text for the model. 73 | 74 | Please refer to the docstring of the above two methods for more information. 
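As an illustration of the call described above, a single image plus a prompt yields the merged encoding that the BLIP-2 models in this repository consume. The checkpoint name is an assumption (any BLIP-2 checkpoint that ships a processor should behave the same), and the image URL is the demo image used by the conversion script:

```python
import requests
from PIL import Image
from model.blip2 import Blip2Processor

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

inputs = processor(images=image, text="a photo of", return_tensors="pt")
print(sorted(inputs.keys()))  # ['attention_mask', 'input_ids', 'pixel_values']
```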
75 | """ 76 | if images is None and text is None: 77 | raise ValueError("You have to specify either images or text.") 78 | 79 | # Get only text 80 | if images is None: 81 | self.current_processor = self.tokenizer 82 | text_encoding = self.tokenizer( 83 | text=text, 84 | add_special_tokens=add_special_tokens, 85 | padding=padding, 86 | truncation=truncation, 87 | max_length=max_length, 88 | stride=stride, 89 | pad_to_multiple_of=pad_to_multiple_of, 90 | return_attention_mask=return_attention_mask, 91 | return_overflowing_tokens=return_overflowing_tokens, 92 | return_special_tokens_mask=return_special_tokens_mask, 93 | return_offsets_mapping=return_offsets_mapping, 94 | return_token_type_ids=return_token_type_ids, 95 | return_length=return_length, 96 | verbose=verbose, 97 | return_tensors=return_tensors, 98 | **kwargs, 99 | ) 100 | return text_encoding 101 | 102 | # add pixel_values 103 | encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) 104 | 105 | if text is not None: 106 | text_encoding = self.tokenizer( 107 | text=text, 108 | add_special_tokens=add_special_tokens, 109 | padding=padding, 110 | truncation=truncation, 111 | max_length=max_length, 112 | stride=stride, 113 | pad_to_multiple_of=pad_to_multiple_of, 114 | return_attention_mask=return_attention_mask, 115 | return_overflowing_tokens=return_overflowing_tokens, 116 | return_special_tokens_mask=return_special_tokens_mask, 117 | return_offsets_mapping=return_offsets_mapping, 118 | return_token_type_ids=return_token_type_ids, 119 | return_length=return_length, 120 | verbose=verbose, 121 | return_tensors=return_tensors, 122 | **kwargs, 123 | ) 124 | else: 125 | text_encoding = None 126 | 127 | if text_encoding is not None: 128 | encoding_image_processor.update(text_encoding) 129 | 130 | return encoding_image_processor 131 | 132 | # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer 133 | def batch_decode(self, *args, **kwargs): 134 | """ 135 | This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please 136 | refer to the docstring of this method for more information. 137 | """ 138 | return self.tokenizer.batch_decode(*args, **kwargs) 139 | 140 | # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer 141 | def decode(self, *args, **kwargs): 142 | """ 143 | This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer 144 | to the docstring of this method for more information. 145 | """ 146 | return self.tokenizer.decode(*args, **kwargs) 147 | 148 | @property 149 | # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names 150 | def model_input_names(self): 151 | tokenizer_input_names = self.tokenizer.model_input_names 152 | image_processor_input_names = self.image_processor.model_input_names 153 | return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) 154 | -------------------------------------------------------------------------------- /model/instructblip/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available 17 | 18 | 19 | _import_structure = { 20 | "configuration_instructblip": [ 21 | "INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", 22 | "InstructBlipConfig", 23 | "InstructBlipQFormerConfig", 24 | "InstructBlipVisionConfig", 25 | ], 26 | "processing_instructblip": ["InstructBlipProcessor"], 27 | } 28 | 29 | try: 30 | if not is_torch_available(): 31 | raise OptionalDependencyNotAvailable() 32 | except OptionalDependencyNotAvailable: 33 | pass 34 | else: 35 | _import_structure["modeling_instructblip"] = [ 36 | "INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST", 37 | "InstructBlipModel", 38 | "InstructBlipQFormerModel", 39 | "InstructBlipPreTrainedModel", 40 | "InstructBlipForConditionalGeneration", 41 | "InstructBlipVisionModel", 42 | ] 43 | 44 | if TYPE_CHECKING: 45 | from .configuration_instructblip import ( 46 | INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, 47 | InstructBlipConfig, 48 | InstructBlipQFormerConfig, 49 | InstructBlipVisionConfig, 50 | ) 51 | from .processing_instructblip import InstructBlipProcessor 52 | 53 | try: 54 | if not is_torch_available(): 55 | raise OptionalDependencyNotAvailable() 56 | except OptionalDependencyNotAvailable: 57 | pass 58 | else: 59 | from .modeling_instructblip import ( 60 | INSTRUCTBLIP_PRETRAINED_MODEL_ARCHIVE_LIST, 61 | InstructBlipForConditionalGeneration, 62 | InstructBlipModel, 63 | InstructBlipPreTrainedModel, 64 | InstructBlipQFormerModel, 65 | InstructBlipVisionModel, 66 | ) 67 | 68 | else: 69 | import sys 70 | 71 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 72 | -------------------------------------------------------------------------------- /model/instructblip/configuration_instructblip.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """ InstructBLIP model configuration""" 16 | 17 | import copy 18 | import os 19 | from typing import Union 20 | 21 | from transformers.configuration_utils import PretrainedConfig 22 | from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES 23 | from transformers.utils import logging 24 | from transformers.models.auto.configuration_auto import CONFIG_MAPPING 25 | 26 | 27 | logger = logging.get_logger(__name__) 28 | 29 | INSTRUCTBLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | "Salesforce/instruct-blip-flan-t5": "https://huggingface.co/Salesforce/instruct-blip-flan-t5/resolve/main/config.json", 31 | } 32 | 33 | 34 | class InstructBlipVisionConfig(PretrainedConfig): 35 | r""" 36 | This is the configuration class to store the configuration of a [`InstructBlipVisionModel`]. It is used to 37 | instantiate a InstructBLIP vision encoder according to the specified arguments, defining the model architecture. 38 | Instantiating a configuration defaults will yield a similar configuration to that of the InstructBLIP 39 | [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture. 40 | 41 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 42 | documentation from [`PretrainedConfig`] for more information. 43 | 44 | Args: 45 | hidden_size (`int`, *optional*, defaults to 1408): 46 | Dimensionality of the encoder layers and the pooler layer. 47 | intermediate_size (`int`, *optional*, defaults to 6144): 48 | Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 49 | num_hidden_layers (`int`, *optional*, defaults to 39): 50 | Number of hidden layers in the Transformer encoder. 51 | num_attention_heads (`int`, *optional*, defaults to 16): 52 | Number of attention heads for each attention layer in the Transformer encoder. 53 | image_size (`int`, *optional*, defaults to 224): 54 | The size (resolution) of each image. 55 | patch_size (`int`, *optional*, defaults to 14): 56 | The size (resolution) of each patch. 57 | hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): 58 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, 59 | `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults 60 | to 1e-5): The epsilon used by the layer normalization layers. 61 | dropout (`float`, *optional*, defaults to 0.0): 62 | The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. 63 | attention_dropout (`float`, *optional*, defaults to 0.0): 64 | The dropout ratio for the attention probabilities. 65 | initializer_range (`float`, *optional*, defaults to 0.02): 66 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 67 | initializer_factor (`float``, *optional*, defaults to 1): 68 | A factor for initializing all weight matrices (should be kept to 1, used internally for initialization 69 | testing). 70 | qkv_bias (`bool`, *optional*, defaults to `True`): 71 | Whether to add a bias to the queries and values in the self-attention layers. 
72 | 73 | Example: 74 | 75 | ```python 76 | >>> from transformers import InstructBlipVisionConfig, InstructBlipVisionModel 77 | 78 | >>> # Initializing a InstructBlipVisionConfig with Salesforce/instruct-blip-flan-t5 style configuration 79 | >>> configuration = InstructBlipVisionConfig() 80 | 81 | >>> # Initializing a InstructBlipVisionModel (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration 82 | >>> model = InstructBlipVisionModel(configuration) 83 | 84 | >>> # Accessing the model configuration 85 | >>> configuration = model.config 86 | ```""" 87 | 88 | model_type = "instructblip_vision_model" 89 | 90 | def __init__( 91 | self, 92 | hidden_size=1408, 93 | intermediate_size=6144, 94 | projection_dim=512, 95 | num_hidden_layers=39, 96 | num_attention_heads=16, 97 | num_channels=3, 98 | image_size=224, 99 | patch_size=14, 100 | hidden_act="gelu", 101 | layer_norm_eps=0.00001, 102 | dropout=0.0, 103 | attention_dropout=0.0, 104 | initializer_range=1e-10, 105 | initializer_factor=1.0, 106 | qkv_bias=True, 107 | **kwargs, 108 | ): 109 | super().__init__(**kwargs) 110 | 111 | self.hidden_size = hidden_size 112 | self.intermediate_size = intermediate_size 113 | self.projection_dim = projection_dim 114 | self.dropout = dropout 115 | self.num_hidden_layers = num_hidden_layers 116 | self.num_attention_heads = num_attention_heads 117 | self.num_channels = num_channels 118 | self.patch_size = patch_size 119 | self.image_size = image_size 120 | self.initializer_range = initializer_range 121 | self.initializer_factor = initializer_factor 122 | self.attention_dropout = attention_dropout 123 | self.layer_norm_eps = layer_norm_eps 124 | self.hidden_act = hidden_act 125 | self.qkv_bias = qkv_bias 126 | 127 | @classmethod 128 | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": 129 | config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) 130 | 131 | # get the vision config dict if we are loading from InstructBlipConfig 132 | if config_dict.get("model_type") == "instructblip": 133 | config_dict = config_dict["vision_config"] 134 | 135 | if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: 136 | logger.warning( 137 | f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " 138 | f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 139 | ) 140 | 141 | return cls.from_dict(config_dict, **kwargs) 142 | 143 | 144 | class InstructBlipQFormerConfig(PretrainedConfig): 145 | r""" 146 | This is the configuration class to store the configuration of a [`InstructBlipQFormerModel`]. It is used to 147 | instantiate a InstructBLIP Querying Transformer (Q-Former) model according to the specified arguments, defining the 148 | model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of 149 | the InstructBLIP [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) 150 | architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 151 | Read the documentation from [`PretrainedConfig`] for more information. 152 | 153 | Note that [`InstructBlipQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. 
154 | 155 | Args: 156 | vocab_size (`int`, *optional*, defaults to 30522): 157 | Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by 158 | the `inputs_ids` passed when calling the model. 159 | hidden_size (`int`, *optional*, defaults to 768): 160 | Dimensionality of the encoder layers and the pooler layer. 161 | num_hidden_layers (`int`, *optional*, defaults to 12): 162 | Number of hidden layers in the Transformer encoder. 163 | num_attention_heads (`int`, *optional*, defaults to 12): 164 | Number of attention heads for each attention layer in the Transformer encoder. 165 | intermediate_size (`int`, *optional*, defaults to 3072): 166 | Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. 167 | hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): 168 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, 169 | `"relu"`, `"silu"` and `"gelu_new"` are supported. 170 | hidden_dropout_prob (`float`, *optional*, defaults to 0.1): 171 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 172 | attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): 173 | The dropout ratio for the attention probabilities. 174 | max_position_embeddings (`int`, *optional*, defaults to 512): 175 | The maximum sequence length that this model might ever be used with. Typically set this to something large 176 | just in case (e.g., 512 or 1024 or 2048). 177 | initializer_range (`float`, *optional*, defaults to 0.02): 178 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 179 | layer_norm_eps (`float`, *optional*, defaults to 1e-12): 180 | The epsilon used by the layer normalization layers. 181 | position_embedding_type (`str`, *optional*, defaults to `"absolute"`): 182 | Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For 183 | positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to 184 | [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). 185 | For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models 186 | with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). 187 | classifier_dropout (`float`, *optional*): 188 | The dropout ratio for the classification head. 189 | cross_attention_frequency (`int`, *optional*, defaults to 2): 190 | The frequency of adding cross-attention to the Transformer layers. 191 | encoder_hidden_size (`int`, *optional*, defaults to 1408): 192 | The hidden size of the hidden states for cross-attention. 
193 | 194 | Examples: 195 | 196 | ```python 197 | >>> from transformers import InstructBlipQFormerConfig, InstructBlipQFormerModel 198 | 199 | >>> # Initializing a InstructBLIP Salesforce/instruct-blip-flan-t5 style configuration 200 | >>> configuration = InstructBlipQFormerConfig() 201 | 202 | >>> # Initializing a model (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration 203 | >>> model = InstructBlipQFormerModel(configuration) 204 | >>> # Accessing the model configuration 205 | >>> configuration = model.config 206 | ```""" 207 | model_type = "instructblip_qformer" 208 | 209 | def __init__( 210 | self, 211 | vocab_size=30522, 212 | hidden_size=768, 213 | num_hidden_layers=12, 214 | num_attention_heads=12, 215 | intermediate_size=3072, 216 | hidden_act="gelu", 217 | hidden_dropout_prob=0.1, 218 | attention_probs_dropout_prob=0.1, 219 | max_position_embeddings=512, 220 | initializer_range=0.02, 221 | layer_norm_eps=1e-12, 222 | pad_token_id=0, 223 | position_embedding_type="absolute", 224 | classifier_dropout=None, 225 | cross_attention_frequency=2, 226 | encoder_hidden_size=1408, 227 | **kwargs, 228 | ): 229 | super().__init__(pad_token_id=pad_token_id, **kwargs) 230 | 231 | self.vocab_size = vocab_size 232 | self.hidden_size = hidden_size 233 | self.num_hidden_layers = num_hidden_layers 234 | self.num_attention_heads = num_attention_heads 235 | self.hidden_act = hidden_act 236 | self.intermediate_size = intermediate_size 237 | self.hidden_dropout_prob = hidden_dropout_prob 238 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 239 | self.max_position_embeddings = max_position_embeddings 240 | self.initializer_range = initializer_range 241 | self.layer_norm_eps = layer_norm_eps 242 | self.position_embedding_type = position_embedding_type 243 | self.classifier_dropout = classifier_dropout 244 | self.cross_attention_frequency = cross_attention_frequency 245 | self.encoder_hidden_size = encoder_hidden_size 246 | 247 | @classmethod 248 | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": 249 | config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) 250 | 251 | # get the qformer config dict if we are loading from InstructBlipConfig 252 | if config_dict.get("model_type") == "instructblip": 253 | config_dict = config_dict["qformer_config"] 254 | 255 | if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: 256 | logger.warning( 257 | f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " 258 | f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 259 | ) 260 | 261 | return cls.from_dict(config_dict, **kwargs) 262 | 263 | 264 | class InstructBlipConfig(PretrainedConfig): 265 | r""" 266 | [`InstructBlipConfig`] is the configuration class to store the configuration of a 267 | [`InstructBlipForConditionalGeneration`]. It is used to instantiate a InstructBLIP model according to the specified 268 | arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with 269 | the defaults will yield a similar configuration to that of the InstructBLIP 270 | [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture. 271 | 272 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the 273 | documentation from [`PretrainedConfig`] for more information. 274 | 275 | Args: 276 | vision_config (`dict`, *optional*): 277 | Dictionary of configuration options used to initialize [`InstructBlipVisionConfig`]. 278 | qformer_config (`dict`, *optional*): 279 | Dictionary of configuration options used to initialize [`InstructBlipQFormerConfig`]. 280 | text_config (`dict`, *optional*): 281 | Dictionary of configuration options used to initialize any [`PretrainedConfig`]. 282 | num_query_tokens (`int`, *optional*, defaults to 32): 283 | The number of query tokens passed through the Transformer. 284 | 285 | kwargs (*optional*): 286 | Dictionary of keyword arguments. 287 | 288 | Example: 289 | 290 | ```python 291 | >>> from transformers import ( 292 | ... InstructBlipVisionConfig, 293 | ... InstructBlipQFormerConfig, 294 | ... OPTConfig, 295 | ... InstructBlipConfig, 296 | ... InstructBlipForConditionalGeneration, 297 | ... ) 298 | 299 | >>> # Initializing a InstructBlipConfig with Salesforce/instruct-blip-flan-t5 style configuration 300 | >>> configuration = InstructBlipConfig() 301 | 302 | >>> # Initializing a InstructBlipForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration 303 | >>> model = InstructBlipForConditionalGeneration(configuration) 304 | 305 | >>> # Accessing the model configuration 306 | >>> configuration = model.config 307 | 308 | >>> # We can also initialize a InstructBlipConfig from a InstructBlipVisionConfig, InstructBlipQFormerConfig and any PretrainedConfig 309 | 310 | >>> # Initializing InstructBLIP vision, InstructBLIP Q-Former and language model configurations 311 | >>> vision_config = InstructBlipVisionConfig() 312 | >>> qformer_config = InstructBlipQFormerConfig() 313 | >>> text_config = OPTConfig() 314 | 315 | >>> config = InstructBlipConfig.from_text_vision_configs(vision_config, qformer_config, text_config) 316 | ```""" 317 | 318 | model_type = "instructblip" 319 | is_composition = True 320 | 321 | def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs): 322 | super().__init__(**kwargs) 323 | 324 | if vision_config is None: 325 | vision_config = {} 326 | logger.info("vision_config is None. initializing the InstructBlipVisionConfig with default values.") 327 | 328 | if qformer_config is None: 329 | qformer_config = {} 330 | logger.info("qformer_config is None. Initializing the InstructBlipQFormerConfig with default values.") 331 | 332 | if text_config is None: 333 | text_config = {} 334 | logger.info("text_config is None. 
Initializing the text config with default values (`OPTConfig`).") 335 | 336 | self.vision_config = InstructBlipVisionConfig(**vision_config) 337 | self.qformer_config = InstructBlipQFormerConfig(**qformer_config) 338 | text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" 339 | self.text_config = CONFIG_MAPPING[text_model_type](**text_config) 340 | 341 | self.tie_word_embeddings = self.text_config.tie_word_embeddings 342 | self.is_encoder_decoder = self.text_config.is_encoder_decoder 343 | 344 | self.num_query_tokens = num_query_tokens 345 | self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size 346 | self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES 347 | self.initializer_factor = 1.0 348 | self.initializer_range = 0.02 349 | 350 | @classmethod 351 | def from_vision_qformer_text_configs( 352 | cls, 353 | vision_config: InstructBlipVisionConfig, 354 | qformer_config: InstructBlipQFormerConfig, 355 | text_config: PretrainedConfig, 356 | **kwargs, 357 | ): 358 | r""" 359 | Instantiate a [`InstructBlipConfig`] (or a derived class) from a InstructBLIP vision model, Q-Former and 360 | language model configurations. 361 | 362 | Returns: 363 | [`InstructBlipConfig`]: An instance of a configuration object 364 | """ 365 | 366 | return cls( 367 | vision_config=vision_config.to_dict(), 368 | qformer_config=qformer_config.to_dict(), 369 | text_config=text_config.to_dict(), 370 | **kwargs, 371 | ) 372 | 373 | def to_dict(self): 374 | """ 375 | Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 376 | 377 | Returns: 378 | `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, 379 | """ 380 | output = copy.deepcopy(self.__dict__) 381 | output["vision_config"] = self.vision_config.to_dict() 382 | output["qformer_config"] = self.qformer_config.to_dict() 383 | output["text_config"] = self.text_config.to_dict() 384 | output["model_type"] = self.__class__.model_type 385 | return output 386 | -------------------------------------------------------------------------------- /model/instructblip/convert_instructblip_original_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Convert InstructBLIP checkpoints from the original repository. 
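Before the InstructBLIP conversion script, the same composition sketch as for BLIP-2 but with a decoder-only backbone, mirroring the `deep_speed_instructblip_vicuna*` run scripts; `LlamaConfig()` here is only a default-valued stand-in, not the real Vicuna hyper-parameters:

```python
from transformers import LlamaConfig
from model.instructblip import (
    InstructBlipConfig,
    InstructBlipQFormerConfig,
    InstructBlipVisionConfig,
)

config = InstructBlipConfig.from_vision_qformer_text_configs(
    InstructBlipVisionConfig(), InstructBlipQFormerConfig(), LlamaConfig()
)
# LLaMA is registered as a causal LM, so the decoder-only path is selected.
assert config.use_decoder_only_language_model
assert not config.is_encoder_decoder
```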
17 | 18 | URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblip 19 | """ 20 | 21 | import argparse 22 | 23 | import requests 24 | import torch 25 | 26 | # pip3 install salesforce-lavis 27 | # I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) 28 | # also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml 29 | # same for Vicuna-13b 30 | from lavis.models import load_model_and_preprocess 31 | from PIL import Image 32 | 33 | from transformers import ( 34 | AutoTokenizer, 35 | BlipImageProcessor, 36 | InstructBlipConfig, 37 | InstructBlipForConditionalGeneration, 38 | InstructBlipProcessor, 39 | InstructBlipQFormerConfig, 40 | InstructBlipVisionConfig, 41 | LlamaConfig, 42 | LlamaTokenizer, 43 | T5Config, 44 | T5TokenizerFast, 45 | ) 46 | from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD 47 | 48 | 49 | def load_demo_image(): 50 | url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" 51 | image = Image.open(requests.get(url, stream=True).raw).convert("RGB") 52 | 53 | return image 54 | 55 | 56 | # here we list all keys to be renamed (original name on the left, our name on the right) 57 | def create_rename_keys(config): 58 | rename_keys = [] 59 | # fmt: off 60 | 61 | # vision encoder 62 | rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) 63 | rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) 64 | rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) 65 | rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) 66 | rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) 67 | rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) 68 | 69 | for i in range(config.vision_config.num_hidden_layers): 70 | rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) 71 | rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) 72 | rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) 73 | rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) 74 | rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) 75 | rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) 76 | rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) 77 | rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) 78 | rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) 79 | rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) 80 | rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) 81 | 
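    # (Added note, not part of the original script.) The (old, new) pairs collected in this
    # function are consumed later in convert_blip2_checkpoint(); rename_key() simply pops the
    # tensor stored under the old key and re-inserts it under the new one, i.e. roughly:
    #   for src, dest in create_rename_keys(config):
    #       rename_key(state_dict, src, dest)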
82 | # QFormer 83 | rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) 84 | rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.embeddings.layernorm.bias")) 85 | 86 | # fmt: on 87 | return rename_keys 88 | 89 | 90 | def rename_key(dct, old, new): 91 | val = dct.pop(old) 92 | dct[new] = val 93 | 94 | 95 | def read_in_q_v_bias(state_dict, config): 96 | for i in range(config.vision_config.num_hidden_layers): 97 | # read in original q and v biases 98 | q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") 99 | v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") 100 | 101 | # next, set bias in the state dict 102 | qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) 103 | state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias 104 | 105 | 106 | def get_blip2_config(model_name): 107 | image_size = 364 if "coco" in model_name else 224 108 | vision_config = InstructBlipVisionConfig(image_size=image_size).to_dict() 109 | 110 | # make sure the models have proper bos_token_id and eos_token_id set (important for generation) 111 | # seems like flan-T5 models don't have bos_token_id properly set? 112 | if "t5-xl" in model_name: 113 | text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() 114 | elif "t5-xxl" in model_name: 115 | text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() 116 | elif "vicuna-7b" in model_name: 117 | text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() 118 | elif "vicuna-13b" in model_name: 119 | text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() 120 | else: 121 | raise ValueError("Model name not supported") 122 | 123 | # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 124 | qformer_config = InstructBlipQFormerConfig(vocab_size=30523).to_dict() 125 | config = InstructBlipConfig(vision_config=vision_config, text_config=text_config, qformer_config=qformer_config) 126 | 127 | return config, image_size 128 | 129 | 130 | @torch.no_grad() 131 | def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): 132 | """ 133 | Copy/paste/tweak model's weights to Transformers design. 
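    (Added usage note, not in the original docstring; the output path is only an
    illustrative example.) Typical invocation once the LAVIS dependencies above are installed:

        python convert_instructblip_original_to_pytorch.py \
            --model_name instructblip-flan-t5-xl \
            --pytorch_dump_folder_path ./converted-instructblip-flan-t5-xl

    Add --push_to_hub to also upload the converted model and processor.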
134 | """ 135 | qformer_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", truncation_side="left") 136 | qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"}) 137 | 138 | if "t5" in model_name: 139 | tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left") 140 | elif "vicuna" in model_name: 141 | tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left") 142 | tokenizer.add_special_tokens({"pad_token": "[PAD]"}) 143 | tokenizer.add_special_tokens({"bos_token": ""}) 144 | tokenizer.add_special_tokens({"eos_token": ""}) 145 | tokenizer.add_special_tokens({"unk_token": ""}) 146 | 147 | config, image_size = get_blip2_config(model_name) 148 | hf_model = InstructBlipForConditionalGeneration(config).eval() 149 | 150 | model_name_to_original = { 151 | "instructblip-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"), 152 | "instructblip-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"), 153 | "instructblip-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"), 154 | "instructblip-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"), 155 | } 156 | 157 | name, type = model_name_to_original[model_name] 158 | 159 | # load original model 160 | print("Loading original model...") 161 | hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu" 162 | lavis_device = "cuda:3" if torch.cuda.is_available() else "cpu" 163 | original_model, vis_processors, _ = load_model_and_preprocess( 164 | name=name, model_type=type, is_eval=True, device=lavis_device 165 | ) 166 | original_model.eval() 167 | print("Done!") 168 | 169 | # update state dict keys 170 | state_dict = original_model.state_dict() 171 | rename_keys = create_rename_keys(config) 172 | for src, dest in rename_keys: 173 | rename_key(state_dict, src, dest) 174 | 175 | # some keys can be renamed efficiently 176 | for key, val in state_dict.copy().items(): 177 | val = state_dict.pop(key) 178 | if key.startswith("Qformer.bert"): 179 | key = key.replace("Qformer.bert", "qformer") 180 | if "attention.self" in key: 181 | key = key.replace("self", "attention") 182 | if "llm_proj" in key: 183 | key = key.replace("llm_proj", "language_projection") 184 | if "t5_proj" in key: 185 | key = key.replace("t5_proj", "language_projection") 186 | if key.startswith("llm_model"): 187 | key = key.replace("llm_model", "language_model") 188 | if key.startswith("t5"): 189 | key = key.replace("t5", "language") 190 | state_dict[key] = val 191 | 192 | # read in qv biases 193 | read_in_q_v_bias(state_dict, config) 194 | 195 | # note: weights get loaded in torch.float32 by default 196 | hf_model.load_state_dict(state_dict, strict=True) 197 | 198 | image = load_demo_image() 199 | prompt = "What is unusual about this image?" 
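    # (Added note, not part of the original script.) The InstructBlipProcessor built below
    # bundles three components (see processing_instructblip.py): the BlipImageProcessor
    # produces pixel_values, the language-model tokenizer produces input_ids/attention_mask,
    # and the Q-Former tokenizer's outputs are renamed to qformer_input_ids and
    # qformer_attention_mask, so a single call such as
    #   inputs = processor(images=image, text=prompt, return_tensors="pt")
    # returns the tensors the HF model consumes further down.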
200 | 201 | # create processor 202 | image_processor = BlipImageProcessor( 203 | size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD 204 | ) 205 | processor = InstructBlipProcessor( 206 | image_processor=image_processor, 207 | tokenizer=tokenizer, 208 | qformer_tokenizer=qformer_tokenizer, 209 | ) 210 | inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) 211 | 212 | # make sure processor creates exact same pixel values 213 | original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) 214 | pixel_values = inputs.pixel_values 215 | assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) 216 | 217 | original_model.to(lavis_device) 218 | hf_model.to(hf_model_device) 219 | with torch.no_grad(): 220 | if "vicuna" in model_name: 221 | original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits 222 | logits = hf_model(**inputs).logits 223 | else: 224 | original_logits = original_model( 225 | {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} 226 | ).logits 227 | label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) 228 | labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) 229 | logits = hf_model(**inputs, labels=labels).logits 230 | 231 | print("First values of original logits:", original_logits[0, :3, :3]) 232 | print("First values of HF logits:", logits[0, :3, :3]) 233 | 234 | # assert values 235 | assert original_logits.shape == logits.shape 236 | atol = 1e-1 if "vicuna" in model_name else 1e-2 237 | assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) 238 | print("Looks ok!") 239 | 240 | print("Generating with original model...") 241 | original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) 242 | 243 | # important: we need to cast the weights of the HF model to the appropriate type 244 | print("Generating with HF model...") 245 | outputs = hf_model.generate( 246 | **inputs, 247 | do_sample=False, 248 | num_beams=5, 249 | max_length=256, 250 | min_length=1, 251 | top_p=0.9, 252 | repetition_penalty=1.5, 253 | length_penalty=1.0, 254 | temperature=1, 255 | ) 256 | if "vicuna" in model_name: 257 | # convert output id 0 to 2 (eos_token_id) 258 | # TODO add this in the generate method? 
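        # (Added note.) In the LLaMA/Vicuna vocabulary id 0 is <unk> and id 2 is </s>,
        # i.e. the eos_token_id mentioned in the comment above.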
259 | outputs[outputs == 0] = 2 260 | print("Original generation:", original_outputs) 261 | output_text = processor.batch_decode(outputs, skip_special_tokens=True) 262 | output_text = [text.strip() for text in output_text] 263 | print("HF generation:", output_text) 264 | 265 | if pytorch_dump_folder_path is not None: 266 | processor.save_pretrained(pytorch_dump_folder_path) 267 | hf_model.save_pretrained(pytorch_dump_folder_path) 268 | 269 | if push_to_hub: 270 | processor.push_to_hub(f"nielsr/{model_name}") 271 | hf_model.push_to_hub(f"nielsr/{model_name}") 272 | 273 | 274 | if __name__ == "__main__": 275 | parser = argparse.ArgumentParser() 276 | choices = [ 277 | "instructblip-vicuna-7b", 278 | "instructblip-vicuna-13b", 279 | "instructblip-flan-t5-xl", 280 | "instructblip-flan-t5-xxl", 281 | ] 282 | parser.add_argument( 283 | "--model_name", 284 | default="instructblip-flan-t5-xl", 285 | choices=choices, 286 | type=str, 287 | help="Path to hf config.json of model to convert", 288 | ) 289 | parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") 290 | parser.add_argument( 291 | "--push_to_hub", 292 | action="store_true", 293 | help="Whether to push the model and processor to the hub after converting", 294 | ) 295 | 296 | args = parser.parse_args() 297 | 298 | convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) 299 | -------------------------------------------------------------------------------- /model/instructblip/processing_instructblip.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former. 17 | """ 18 | 19 | import os 20 | from typing import List, Optional, Union 21 | 22 | from transformers.processing_utils import ProcessorMixin 23 | from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy 24 | from transformers.utils import TensorType 25 | # from ..auto import AutoTokenizer 26 | from transformers.models.auto.configuration_auto import AutoConfig 27 | from transformers.models.auto.processing_auto import AutoTokenizer 28 | 29 | 30 | class InstructBlipProcessor(ProcessorMixin): 31 | r""" 32 | Constructs an InstructBLIP processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single 33 | processor. 34 | 35 | [`InstructBlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the 36 | docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. 37 | 38 | Args: 39 | image_processor (`BlipImageProcessor`): 40 | An instance of [`BlipImageProcessor`]. The image processor is a required input. 
41 | tokenizer (`AutoTokenizer`): 42 | An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. 43 | """ 44 | attributes = ["image_processor", "tokenizer"] 45 | image_processor_class = "BlipImageProcessor" 46 | tokenizer_class = "AutoTokenizer" 47 | 48 | def __init__(self, image_processor, tokenizer, qformer_tokenizer): 49 | super().__init__(image_processor, tokenizer) 50 | self.current_processor = self.image_processor 51 | 52 | # add QFormer tokenizer 53 | self.qformer_tokenizer = qformer_tokenizer 54 | 55 | def __call__( 56 | self, 57 | images=None, 58 | text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, 59 | add_special_tokens: bool = True, 60 | padding: Union[bool, str, PaddingStrategy] = False, 61 | truncation: Union[bool, str, TruncationStrategy] = None, 62 | max_length: Optional[int] = None, 63 | stride: int = 0, 64 | pad_to_multiple_of: Optional[int] = None, 65 | return_attention_mask: Optional[bool] = None, 66 | return_overflowing_tokens: bool = False, 67 | return_special_tokens_mask: bool = False, 68 | return_offsets_mapping: bool = False, 69 | return_token_type_ids: bool = False, 70 | return_length: bool = False, 71 | verbose: bool = True, 72 | return_tensors: Optional[Union[str, TensorType]] = None, 73 | **kwargs, 74 | ) -> BatchEncoding: 75 | """ 76 | This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and 77 | [`BertTokenizerFast.__call__`] to prepare text for the model. 78 | 79 | Please refer to the docstring of the above two methods for more information. 80 | """ 81 | if images is None and text is None: 82 | raise ValueError("You have to specify either images or text.") 83 | 84 | # Get only text 85 | if images is None: 86 | self.current_processor = self.tokenizer 87 | text_encoding = self.tokenizer( 88 | text=text, 89 | add_special_tokens=add_special_tokens, 90 | padding=padding, 91 | truncation=truncation, 92 | max_length=max_length, 93 | stride=stride, 94 | pad_to_multiple_of=pad_to_multiple_of, 95 | return_attention_mask=return_attention_mask, 96 | return_overflowing_tokens=return_overflowing_tokens, 97 | return_special_tokens_mask=return_special_tokens_mask, 98 | return_offsets_mapping=return_offsets_mapping, 99 | return_token_type_ids=return_token_type_ids, 100 | return_length=return_length, 101 | verbose=verbose, 102 | return_tensors=return_tensors, 103 | **kwargs, 104 | ) 105 | return text_encoding 106 | 107 | # add pixel_values 108 | encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) 109 | 110 | if text is not None: 111 | text_encoding = self.tokenizer( 112 | text=text, 113 | add_special_tokens=add_special_tokens, 114 | padding=padding, 115 | truncation=truncation, 116 | max_length=max_length, 117 | stride=stride, 118 | pad_to_multiple_of=pad_to_multiple_of, 119 | return_attention_mask=return_attention_mask, 120 | return_overflowing_tokens=return_overflowing_tokens, 121 | return_special_tokens_mask=return_special_tokens_mask, 122 | return_offsets_mapping=return_offsets_mapping, 123 | return_token_type_ids=return_token_type_ids, 124 | return_length=return_length, 125 | verbose=verbose, 126 | return_tensors=return_tensors, 127 | **kwargs, 128 | ) 129 | qformer_text_encoding = self.qformer_tokenizer( 130 | text=text, 131 | add_special_tokens=add_special_tokens, 132 | padding=padding, 133 | truncation=truncation, 134 | max_length=max_length, 135 | stride=stride, 136 | pad_to_multiple_of=pad_to_multiple_of, 137 | 
return_attention_mask=return_attention_mask, 138 | return_overflowing_tokens=return_overflowing_tokens, 139 | return_special_tokens_mask=return_special_tokens_mask, 140 | return_offsets_mapping=return_offsets_mapping, 141 | return_token_type_ids=return_token_type_ids, 142 | return_length=return_length, 143 | verbose=verbose, 144 | return_tensors=return_tensors, 145 | **kwargs, 146 | ) 147 | qformer_text_encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids") 148 | qformer_text_encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask") 149 | text_encoding.update(qformer_text_encoding) 150 | else: 151 | text_encoding = None 152 | 153 | if text_encoding is not None: 154 | encoding_image_processor.update(text_encoding) 155 | 156 | return encoding_image_processor 157 | 158 | # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer 159 | def batch_decode(self, *args, **kwargs): 160 | """ 161 | This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please 162 | refer to the docstring of this method for more information. 163 | """ 164 | return self.tokenizer.batch_decode(*args, **kwargs) 165 | 166 | # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer 167 | def decode(self, *args, **kwargs): 168 | """ 169 | This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer 170 | to the docstring of this method for more information. 171 | """ 172 | return self.tokenizer.decode(*args, **kwargs) 173 | 174 | @property 175 | # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names 176 | def model_input_names(self): 177 | tokenizer_input_names = self.tokenizer.model_input_names 178 | image_processor_input_names = self.image_processor.model_input_names 179 | return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) 180 | 181 | # overwrite to save the Q-Former tokenizer in a separate folder 182 | def save_pretrained(self, save_directory, **kwargs): 183 | if os.path.isfile(save_directory): 184 | raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") 185 | os.makedirs(save_directory, exist_ok=True) 186 | qformer_tokenizer_path = os.path.join(save_directory, "qformer_tokenizer") 187 | self.qformer_tokenizer.save_pretrained(qformer_tokenizer_path) 188 | return super().save_pretrained(save_directory, **kwargs) 189 | 190 | # overwrite to load the Q-Former tokenizer from a separate folder 191 | @classmethod 192 | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): 193 | qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer") 194 | args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) 195 | args.append(qformer_tokenizer) 196 | return cls(*args) 197 | -------------------------------------------------------------------------------- /model/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from transformers import ( 4 | BertModel, 5 | RobertaModel, 6 | AlbertModel, 7 | DebertaV2Model, 8 | XLNetModel, 9 | DebertaV2Model, 10 | AutoConfig 11 | ) 12 | import torch 13 | 14 | from model.blip2.modeling_blip_2 import Blip2ForConditionalGeneration 15 | from model.instructblip.modeling_instructblip import 
InstructBlipForConditionalGeneration 16 | 17 | MODEL_CLASS = { 18 | "blip-2": Blip2ForConditionalGeneration, 19 | "instructblip": InstructBlipForConditionalGeneration, 20 | 21 | } 22 | 23 | 24 | def get_model(model_args, config: AutoConfig, fix_bert: bool = False): 25 | 26 | model_class = MODEL_CLASS[config.model_type] 27 | model = model_class.from_pretrained( 28 | model_args.model_name_or_path, 29 | config=config 30 | ) 31 | 32 | 33 | for param in model.parameters(): 34 | param.requires_grad = False 35 | 36 | for param in model.language_projection.parameters(): 37 | param.requires_grad = True 38 | 39 | if model_args.backbone_model == 'flan-t5': 40 | for block in model.language_model.encoder.block: 41 | block.layer[0].SelfAttention.q.weight.requires_grad=True 42 | block.layer[0].SelfAttention.v.requires_grad=True 43 | 44 | for block in model.language_model.decoder.block: 45 | block.layer[0].SelfAttention.q.weight.requires_grad=True 46 | block.layer[0].SelfAttention.v.requires_grad=True 47 | block.layer[1].EncDecAttention.q.requires_grad=True 48 | block.layer[1].EncDecAttention.v.requires_grad=True 49 | else:# vicuna 50 | print(f"vicuna layer:{len(model.language_model.model.layers)}") 51 | for block in model.language_model.model.layers: 52 | block.self_attn.q_proj.weight.requires_grad=True 53 | block.self_attn.v_proj.weight.requires_grad=True 54 | 55 | 56 | 57 | all_param = 0 58 | trained_param=0 59 | for _, param in model.named_parameters(): 60 | all_param += param.numel() 61 | if param.requires_grad ==True: 62 | trained_param+=param.numel() 63 | total_param = all_param 64 | 65 | print('***** total param is {} *****'.format(total_param)) 66 | print('***** total trained param is {} *****'.format(trained_param)) 67 | return model 68 | -------------------------------------------------------------------------------- /paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/paper.pdf -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import json 5 | import numpy as np 6 | from typing import Dict 7 | 8 | import datasets 9 | import transformers 10 | import torch 11 | from transformers import set_seed, Trainer, TrainerCallback 12 | from transformers.trainer_utils import get_last_checkpoint 13 | from os.path import join 14 | from arguments import get_args 15 | from tasks.utils import * 16 | import warnings 17 | import time 18 | 19 | warnings.filterwarnings("ignore") 20 | logger = logging.getLogger(__name__) 21 | 22 | class ProfCallback(TrainerCallback): 23 | def __init__(self, prof): 24 | self.prof = prof 25 | 26 | def on_step_end(self, args, state, control, **kwargs): 27 | self.prof.step() 28 | 29 | def train(trainer, resume_from_checkpoint=None, last_checkpoint=None): 30 | checkpoint = None 31 | 32 | print("start training") 33 | 34 | 35 | if resume_from_checkpoint is not None: 36 | checkpoint = resume_from_checkpoint 37 | elif last_checkpoint is not None: 38 | checkpoint = last_checkpoint 39 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 40 | # trainer.save_model() 41 | # with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU, 42 | # torch.profiler.ProfilerActivity.CUDA], 43 | # schedule=torch.profiler.schedule(skip_first=3, wait=1, warmup=1, active=2, repeat=2), 44 
| # on_trace_ready=torch.profiler.tensorboard_trace_handler('hf-training-trainer'), 45 | # profile_memory=True, 46 | # with_stack=True, 47 | # record_shapes=True) as prof: 48 | 49 | # trainer.add_callback(ProfCallback(prof=prof)) 50 | # train_result = trainer.train(resume_from_checkpoint=checkpoint) 51 | # trainer.save_model() 52 | 53 | metrics = train_result.metrics 54 | 55 | trainer.log_metrics("train", metrics) 56 | trainer.save_metrics("train", metrics) 57 | trainer.save_state() 58 | 59 | trainer.log_best_metrics() 60 | 61 | 62 | def evaluate(trainer): 63 | logger.info("*** Evaluate ***") 64 | 65 | eval_metrics = trainer.evaluate() 66 | if 'eval_BleuScore' in eval_metrics: 67 | eval_bleu = eval_metrics.pop('eval_BleuScore') 68 | trainer.log_metrics("eval", eval_metrics) 69 | trainer.save_metrics("eval", eval_metrics) 70 | 71 | test_metrics = trainer.evaluate(eval_dataset=trainer.predict_dataset, metric_key_prefix="test",) 72 | if 'test_BleuScore' in test_metrics: 73 | test_bleu = test_metrics.pop('test_BleuScore') 74 | trainer.log_metrics("test", test_metrics) 75 | trainer.save_metrics("test", test_metrics) 76 | 77 | 78 | def predict(trainer, predict_dataset=None): 79 | if predict_dataset is None: 80 | logger.info("No dataset is available for testing") 81 | 82 | elif isinstance(predict_dataset, dict): 83 | 84 | for dataset_name, d in predict_dataset.items(): 85 | logger.info("*** Predict: %s ***" % dataset_name) 86 | predictions, labels, metrics = trainer.predict( 87 | d, metric_key_prefix="predict" 88 | ) 89 | predictions = predictions.numpy() 90 | if 'test_BleuScore' in metrics: 91 | test_bleu = metrics.pop('test_BleuScore') 92 | trainer.log_metrics("predict", metrics) 93 | trainer.save_metrics("predict", metrics) 94 | 95 | else: 96 | logger.info("*** Predict ***") 97 | predictions, labels, metrics = trainer.predict( 98 | predict_dataset, metric_key_prefix="predict" 99 | ) 100 | if 'predict_BleuScore' in metrics: 101 | predict_bleu = metrics.pop('predict_BleuScore') 102 | trainer.log_metrics("predict", metrics) 103 | trainer.save_metrics("predict", metrics) 104 | 105 | with open(os.path.join('./checkpoints/', trainer.model_args.experiment_name, "predictions.json"), "w") as f: 106 | json.dump(predictions.tolist(), f, indent=4) 107 | with open(os.path.join('./checkpoints/', trainer.model_args.experiment_name, "labels.json"), "w") as f: 108 | json.dump(labels.tolist(), f, indent=4) 109 | if __name__ == "__main__": 110 | args = get_args() 111 | 112 | model_args, data_args, training_args = args 113 | # log_file = join(training_args.output_dir+"/log_test.txt") 114 | logging.basicConfig( 115 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 116 | datefmt="%m/%d/%Y %H:%M:%S", 117 | # handlers=[logging.StreamHandler(sys.stdout),logging.FileHandler(log_file)], 118 | handlers=[logging.StreamHandler(sys.stdout)], 119 | ) 120 | log_level = training_args.get_process_log_level() 121 | logger.setLevel(log_level) 122 | datasets.utils.logging.set_verbosity(log_level) 123 | transformers.utils.logging.set_verbosity(log_level) 124 | transformers.utils.logging.enable_default_handler() 125 | transformers.utils.logging.enable_explicit_format() 126 | 127 | # Log on each process the small summary: 128 | logger.warning( 129 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 130 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 131 | ) 132 | logger.info(f"Training/evaluation 
parameters {training_args}") 133 | 134 | if not os.path.isdir("checkpoints") or not os.path.exists("checkpoints"): 135 | os.mkdir("checkpoints") 136 | 137 | if data_args.dataset_name.lower() in ["flickr"]: 138 | from tasks.vqa.get_trainer import get_trainer 139 | else: 140 | raise NotImplementedError( 141 | "Task {} is not implemented. Please choose a task from: {}".format(data_args.dataset_name)) 142 | 143 | set_seed(training_args.seed) 144 | 145 | trainer, predict_dataset = get_trainer(args) 146 | 147 | last_checkpoint = None 148 | if ( 149 | os.path.isdir(training_args.output_dir) 150 | and training_args.do_train 151 | and not training_args.overwrite_output_dir 152 | ): 153 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 154 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 155 | raise ValueError( 156 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 157 | "Use --overwrite_output_dir to overcome." 158 | ) 159 | elif ( 160 | last_checkpoint is not None and training_args.resume_from_checkpoint is None 161 | ): 162 | logger.info( 163 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 164 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 165 | ) 166 | 167 | if training_args.do_train: 168 | train(trainer, training_args.resume_from_checkpoint, last_checkpoint) 169 | 170 | if training_args.do_eval: 171 | evaluate(trainer) 172 | 173 | if training_args.do_predict: 174 | predict(trainer, predict_dataset) 175 | 176 | -------------------------------------------------------------------------------- /run_script/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/run_script/.DS_Store -------------------------------------------------------------------------------- /run_script/flickr/deep_speed_blip2_t5xl.sh: -------------------------------------------------------------------------------- 1 | 2 | export EXPERIMENT_NAME=BLIP_deepSpeed_t5xl 3 | export DATASET_NAME=flickr 4 | export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6 5 | export MODEL_DIR=models/ 6 | export NCCL_P2P_LEVEL=NVL 7 | export MODEL_NAME=blip2-flan-t5-xl 8 | model_name_or_path=model/blip2-flan-t5-xl 9 | processor_path=model/blip2-flan-t5-xl 10 | # Calculate the number of GPUs 11 | IFS=',' read -ra GPU_IDS <<< "$CUDA_VISIBLE_DEVICES" 12 | num_gpus="${#GPU_IDS[@]}" 13 | 14 | echo "Number of GPUs available: $num_gpus" 15 | 16 | 17 | 18 | bs=12 19 | eval_bs=16 20 | lr=5e-5 21 | dropout=0.1 22 | epoch=3 23 | seed=1234 24 | do_train=True 25 | do_test=True 26 | do_valid=True 27 | master_port=29507 28 | backbone_model=flan-t5 29 | model_type=blip2 30 | data_dir=MIC_tool 31 | 32 | eval_steps=1000 33 | save_steps=1000 34 | 35 | deepspeed_config=config/deepspeed_config.json 36 | generation_max_length=128 37 | label_max_length=128 38 | max_seq_length=1152 39 | 40 | load_datatype=json 41 | DONE_PREPROCESS=False 42 | TRAIN_PREPROCESS=True 43 | # Set TRAIN_PREPROCESS to be False if you want to use the streaming preprocess of huggingface dataset; 44 | # OR it will preprocess the data in the data colloctor fuction of the dataset 45 | # 926000 46 | train_data_size=1889000 47 | gradient_accumulation_steps=8 48 | image_place_holder='图' 49 | 50 | max_steps=$((($epoch * $train_data_size) / ($bs * $gradient_accumulation_steps * $num_gpus))) 51 | 52 | # if set train_preprocess to be False 
and done_preprocess to be False, to enable the streaminig preprocess of huggingface dataset 53 | # then you need to set the number of max_steps and uncomment the last line 54 | echo "Max Step per GPU: $max_steps" 55 | 56 | # python -m debugpy --wait-for-client --listen 5679 run.py \ 57 | deepspeed --master_port $master_port --num_gpus $num_gpus run.py \ 58 | --experiment_name ${EXPERIMENT_NAME} \ 59 | --dataset_name ${DATASET_NAME} \ 60 | --dataset_config_name None \ 61 | --max_seq_length ${max_seq_length} \ 62 | --overwrite_cache True \ 63 | --pad_to_max_length True \ 64 | --done_preprocess ${DONE_PREPROCESS} \ 65 | --training_preprocess ${TRAIN_PREPROCESS} \ 66 | --load_datatype ${load_datatype} \ 67 | --train_file ${data_dir}/MMICL_t5_json/train/train.jsonl \ 68 | --validation_file ${data_dir}/MMICL_t5_json/val/val.jsonl \ 69 | --test_file ${data_dir}/MMICL_t5_json/test/test.jsonl \ 70 | --do_train $do_train \ 71 | --do_eval $do_valid \ 72 | --do_predict $do_test \ 73 | --per_device_train_batch_size ${bs} \ 74 | --per_device_eval_batch_size ${eval_bs} \ 75 | --bf16 \ 76 | --bf16_full_eval \ 77 | --model_type $model_type \ 78 | --save_total_limit 2 \ 79 | --gradient_accumulation_steps ${gradient_accumulation_steps} \ 80 | --num_train_epochs ${epoch} \ 81 | --output_dir checkpoints/${EXPERIMENT_NAME} \ 82 | --learning_rate ${lr} \ 83 | --weight_decay 0.0005 \ 84 | --seed ${seed} \ 85 | --warmup_ratio 0.2 \ 86 | --evaluation_strategy steps \ 87 | --eval_steps ${eval_steps} \ 88 | --remove_unused_columns False \ 89 | --model_name_or_path $model_name_or_path \ 90 | --processor_path $processor_path \ 91 | --use_fast_tokenizer True \ 92 | --model_revision main \ 93 | --eval_type val \ 94 | --generation_max_length ${generation_max_length} \ 95 | --label_max_length ${label_max_length} \ 96 | --max_eval_samples 800 \ 97 | --max_predict_samples 800 \ 98 | --using_instruct_qformer False \ 99 | --run_name instructblip-vicuna-13b \ 100 | --load_best_model_at_end True \ 101 | --metric_for_best_model accuracy \ 102 | --greater_is_better True \ 103 | --backbone_model $backbone_model \ 104 | --save_strategy steps \ 105 | --save_steps ${save_steps} \ 106 | --full_bf16_training True \ 107 | --dataloader_num_workers 64 \ 108 | --image_place_holder ${image_place_holder} \ 109 | --logging_steps 100 \ 110 | --data_dir ${data_dir} \ 111 | --overwrite_output_dir \ 112 | --deepspeed ${deepspeed_config} \ 113 | # --max_steps ${max_steps} \ 114 | 115 | -------------------------------------------------------------------------------- /run_script/flickr/deep_speed_blip2_t5xxl.sh: -------------------------------------------------------------------------------- 1 | 2 | export EXPERIMENT_NAME=BLIP_deepSpeed_t5xxl 3 | export DATASET_NAME=flickr 4 | export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6 5 | export MODEL_DIR=models/ 6 | export NCCL_P2P_LEVEL=NVL 7 | export MODEL_NAME=blip2-flan-t5-xxl 8 | model_name_or_path=model/blip2-flan-t5-xxl 9 | processor_path=model/blip2-flan-t5-xxl 10 | # Calculate the number of GPUs 11 | IFS=',' read -ra GPU_IDS <<< "$CUDA_VISIBLE_DEVICES" 12 | num_gpus="${#GPU_IDS[@]}" 13 | 14 | echo "Number of GPUs available: $num_gpus" 15 | 16 | 17 | 18 | bs=3 19 | eval_bs=4 20 | lr=1e-4 21 | dropout=0.1 22 | epoch=3 23 | seed=1234 24 | do_train=True 25 | do_test=True 26 | do_valid=True 27 | master_port=29507 28 | backbone_model=flan-t5 29 | model_type=blip2 30 | data_dir=MIC_tool 31 | 32 | eval_steps=1000 33 | save_steps=1000 34 | 35 | deepspeed_config=config/deepspeed_config.json 36 | 
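# (Added note, not part of the original script.) With the settings in this script
# (epoch=3, train_data_size=1889000, bs=3, gradient_accumulation_steps=8, and the
# 6 GPUs listed in CUDA_VISIBLE_DEVICES), the max_steps formula further down evaluates to
#   (3 * 1889000) / (3 * 8 * 6) = 39354 steps per GPU (integer division).
# It only takes effect if the commented-out --max_steps flag at the end of the deepspeed
# command is re-enabled, which is needed when the streaming preprocessing path is used and
# the dataset length is therefore not known in advance.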
generation_max_length=128 37 | label_max_length=128 38 | max_seq_length=1152 39 | 40 | load_datatype=json 41 | DONE_PREPROCESS=False 42 | TRAIN_PREPROCESS=True 43 | # Set TRAIN_PREPROCESS to be False if you want to use the streaming preprocess of huggingface dataset; 44 | # OR it will preprocess the data in the data colloctor fuction of the dataset 45 | # 926000 46 | train_data_size=1889000 47 | gradient_accumulation_steps=8 48 | image_place_holder='图' 49 | 50 | max_steps=$((($epoch * $train_data_size) / ($bs * $gradient_accumulation_steps * $num_gpus))) 51 | 52 | # if set train_preprocess to be False and done_preprocess to be False, to enable the streaminig preprocess of huggingface dataset 53 | # then you need to set the number of max_steps and uncomment the last line 54 | echo "Max Step per GPU: $max_steps" 55 | 56 | # python -m debugpy --wait-for-client --listen 5679 run.py \ 57 | deepspeed --master_port $master_port --num_gpus $num_gpus run.py \ 58 | --experiment_name ${EXPERIMENT_NAME} \ 59 | --dataset_name ${DATASET_NAME} \ 60 | --dataset_config_name None \ 61 | --max_seq_length ${max_seq_length} \ 62 | --overwrite_cache True \ 63 | --pad_to_max_length True \ 64 | --done_preprocess ${DONE_PREPROCESS} \ 65 | --training_preprocess ${TRAIN_PREPROCESS} \ 66 | --load_datatype ${load_datatype} \ 67 | --train_file ${data_dir}/MMICL_t5_json/train/train.jsonl \ 68 | --validation_file ${data_dir}/MMICL_t5_json/val/val.jsonl \ 69 | --test_file ${data_dir}/MMICL_t5_json/test/test.jsonl \ 70 | --do_train $do_train \ 71 | --do_eval $do_valid \ 72 | --do_predict $do_test \ 73 | --per_device_train_batch_size ${bs} \ 74 | --per_device_eval_batch_size ${eval_bs} \ 75 | --bf16 \ 76 | --bf16_full_eval \ 77 | --model_type $model_type \ 78 | --save_total_limit 2 \ 79 | --gradient_accumulation_steps ${gradient_accumulation_steps} \ 80 | --num_train_epochs ${epoch} \ 81 | --output_dir checkpoints/${EXPERIMENT_NAME} \ 82 | --learning_rate ${lr} \ 83 | --weight_decay 0.0005 \ 84 | --seed ${seed} \ 85 | --warmup_ratio 0.2 \ 86 | --evaluation_strategy steps \ 87 | --eval_steps ${eval_steps} \ 88 | --remove_unused_columns False \ 89 | --model_name_or_path $model_name_or_path \ 90 | --processor_path $processor_path \ 91 | --use_fast_tokenizer True \ 92 | --model_revision main \ 93 | --eval_type val \ 94 | --generation_max_length ${generation_max_length} \ 95 | --label_max_length ${label_max_length} \ 96 | --max_eval_samples 800 \ 97 | --max_predict_samples 800 \ 98 | --using_instruct_qformer False \ 99 | --run_name instructblip-vicuna-13b \ 100 | --load_best_model_at_end True \ 101 | --metric_for_best_model accuracy \ 102 | --greater_is_better True \ 103 | --backbone_model $backbone_model \ 104 | --save_strategy steps \ 105 | --save_steps ${save_steps} \ 106 | --full_bf16_training True \ 107 | --dataloader_num_workers 64 \ 108 | --image_place_holder ${image_place_holder} \ 109 | --logging_steps 100 \ 110 | --data_dir ${data_dir} \ 111 | --overwrite_output_dir \ 112 | --deepspeed ${deepspeed_config} \ 113 | # --max_steps ${max_steps} \ 114 | 115 | -------------------------------------------------------------------------------- /run_script/flickr/deep_speed_instructblip_t5xl.sh: -------------------------------------------------------------------------------- 1 | 2 | export EXPERIMENT_NAME=instruct_BLIP_deepSpeed_t5xl 3 | export DATASET_NAME=flickr 4 | export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6 5 | export MODEL_DIR=models/ 6 | export NCCL_P2P_LEVEL=NVL 7 | export MODEL_NAME=instructblip-flan-t5-xl 8 | 
model_name_or_path=model/instructblip-flan-t5-xl 9 | processor_path=model/instructblip-flan-t5-xl 10 | # Calculate the number of GPUs 11 | IFS=',' read -ra GPU_IDS <<< "$CUDA_VISIBLE_DEVICES" 12 | num_gpus="${#GPU_IDS[@]}" 13 | 14 | echo "Number of GPUs available: $num_gpus" 15 | 16 | 17 | 18 | bs=3 19 | eval_bs=4 20 | lr=1e-4 21 | dropout=0.1 22 | epoch=3 23 | seed=1234 24 | do_train=True 25 | do_test=True 26 | do_valid=True 27 | master_port=29507 28 | backbone_model=flan-t5 29 | model_type=instructblip 30 | data_dir=MIC_tool 31 | 32 | eval_steps=1000 33 | save_steps=1000 34 | 35 | deepspeed_config=config/deepspeed_config.json 36 | generation_max_length=128 37 | label_max_length=128 38 | max_seq_length=1152 39 | 40 | load_datatype=json 41 | DONE_PREPROCESS=False 42 | TRAIN_PREPROCESS=True 43 | # Set TRAIN_PREPROCESS to be False if you want to use the streaming preprocess of huggingface dataset; 44 | # OR it will preprocess the data in the data colloctor fuction of the dataset 45 | # 926000 46 | train_data_size=1889000 47 | gradient_accumulation_steps=8 48 | image_place_holder='图' 49 | 50 | max_steps=$((($epoch * $train_data_size) / ($bs * $gradient_accumulation_steps * $num_gpus))) 51 | 52 | # if set train_preprocess to be False and done_preprocess to be False, to enable the streaminig preprocess of huggingface dataset 53 | # then you need to set the number of max_steps and uncomment the last line 54 | echo "Max Step per GPU: $max_steps" 55 | 56 | # python -m debugpy --wait-for-client --listen 5679 run.py \ 57 | deepspeed --master_port $master_port --num_gpus $num_gpus run.py \ 58 | --experiment_name ${EXPERIMENT_NAME} \ 59 | --dataset_name ${DATASET_NAME} \ 60 | --dataset_config_name None \ 61 | --max_seq_length ${max_seq_length} \ 62 | --overwrite_cache True \ 63 | --pad_to_max_length True \ 64 | --done_preprocess ${DONE_PREPROCESS} \ 65 | --training_preprocess ${TRAIN_PREPROCESS} \ 66 | --load_datatype ${load_datatype} \ 67 | --train_file ${data_dir}/MMICL_t5_json/train/train.jsonl \ 68 | --validation_file ${data_dir}/MMICL_t5_json/val/val.jsonl \ 69 | --test_file ${data_dir}/MMICL_t5_json/test/test.jsonl \ 70 | --do_train $do_train \ 71 | --do_eval $do_valid \ 72 | --do_predict $do_test \ 73 | --per_device_train_batch_size ${bs} \ 74 | --per_device_eval_batch_size ${eval_bs} \ 75 | --bf16 \ 76 | --bf16_full_eval \ 77 | --model_type $model_type \ 78 | --save_total_limit 2 \ 79 | --gradient_accumulation_steps ${gradient_accumulation_steps} \ 80 | --num_train_epochs ${epoch} \ 81 | --output_dir checkpoints/${EXPERIMENT_NAME} \ 82 | --learning_rate ${lr} \ 83 | --weight_decay 0.0005 \ 84 | --seed ${seed} \ 85 | --warmup_ratio 0.2 \ 86 | --evaluation_strategy steps \ 87 | --eval_steps ${eval_steps} \ 88 | --remove_unused_columns False \ 89 | --model_name_or_path $model_name_or_path \ 90 | --processor_path $processor_path \ 91 | --use_fast_tokenizer True \ 92 | --model_revision main \ 93 | --eval_type val \ 94 | --generation_max_length ${generation_max_length} \ 95 | --label_max_length ${label_max_length} \ 96 | --max_eval_samples 800 \ 97 | --max_predict_samples 800 \ 98 | --using_instruct_qformer False \ 99 | --run_name instructblip-vicuna-13b \ 100 | --load_best_model_at_end True \ 101 | --metric_for_best_model accuracy \ 102 | --greater_is_better True \ 103 | --backbone_model $backbone_model \ 104 | --save_strategy steps \ 105 | --save_steps ${save_steps} \ 106 | --full_bf16_training True \ 107 | --dataloader_num_workers 64 \ 108 | --image_place_holder ${image_place_holder} \ 109 | 
--logging_steps 100 \ 110 | --data_dir ${data_dir} \ 111 | --overwrite_output_dir \ 112 | --deepspeed ${deepspeed_config} \ 113 | # --max_steps ${max_steps} \ 114 | 115 | -------------------------------------------------------------------------------- /run_script/flickr/deep_speed_instructblip_t5xxl.sh: -------------------------------------------------------------------------------- 1 | 2 | export EXPERIMENT_NAME=instruct_BLIP_deepSpeed_t5xxl 3 | export DATASET_NAME=flickr 4 | export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6 5 | export MODEL_DIR=models/ 6 | export NCCL_P2P_LEVEL=NVL 7 | export MODEL_NAME=instructblip-flan-t5-xxl 8 | model_name_or_path=model/instructblip-flan-t5-xxl 9 | processor_path=model/instructblip-flan-t5-xxl 10 | # Calculate the number of GPUs 11 | IFS=',' read -ra GPU_IDS <<< "$CUDA_VISIBLE_DEVICES" 12 | num_gpus="${#GPU_IDS[@]}" 13 | 14 | echo "Number of GPUs available: $num_gpus" 15 | 16 | 17 | 18 | bs=3 19 | eval_bs=4 20 | lr=1e-4 21 | dropout=0.1 22 | epoch=3 23 | seed=1234 24 | do_train=True 25 | do_test=True 26 | do_valid=True 27 | master_port=29507 28 | backbone_model=flan-t5 29 | model_type=instructblip 30 | data_dir=MIC_tool 31 | 32 | eval_steps=1000 33 | save_steps=1000 34 | 35 | deepspeed_config=config/deepspeed_config.json 36 | generation_max_length=128 37 | label_max_length=128 38 | max_seq_length=1152 39 | 40 | load_datatype=json 41 | DONE_PREPROCESS=False 42 | TRAIN_PREPROCESS=True 43 | # Set TRAIN_PREPROCESS to be False if you want to use the streaming preprocess of huggingface dataset; 44 | # OR it will preprocess the data in the data colloctor fuction of the dataset 45 | # 926000 46 | train_data_size=1889000 47 | gradient_accumulation_steps=8 48 | image_place_holder='图' 49 | 50 | max_steps=$((($epoch * $train_data_size) / ($bs * $gradient_accumulation_steps * $num_gpus))) 51 | 52 | # if set train_preprocess to be False and done_preprocess to be False, to enable the streaminig preprocess of huggingface dataset 53 | # then you need to set the number of max_steps and uncomment the last line 54 | echo "Max Step per GPU: $max_steps" 55 | 56 | # python -m debugpy --wait-for-client --listen 5679 run.py \ 57 | deepspeed --master_port $master_port --num_gpus $num_gpus run.py \ 58 | --experiment_name ${EXPERIMENT_NAME} \ 59 | --dataset_name ${DATASET_NAME} \ 60 | --dataset_config_name None \ 61 | --max_seq_length ${max_seq_length} \ 62 | --overwrite_cache True \ 63 | --pad_to_max_length True \ 64 | --done_preprocess ${DONE_PREPROCESS} \ 65 | --training_preprocess ${TRAIN_PREPROCESS} \ 66 | --load_datatype ${load_datatype} \ 67 | --train_file ${data_dir}/MMICL_t5_json/train/train.jsonl \ 68 | --validation_file ${data_dir}/MMICL_t5_json/val/val.jsonl \ 69 | --test_file ${data_dir}/MMICL_t5_json/test/test.jsonl \ 70 | --do_train $do_train \ 71 | --do_eval $do_valid \ 72 | --do_predict $do_test \ 73 | --per_device_train_batch_size ${bs} \ 74 | --per_device_eval_batch_size ${eval_bs} \ 75 | --bf16 \ 76 | --bf16_full_eval \ 77 | --model_type $model_type \ 78 | --save_total_limit 2 \ 79 | --gradient_accumulation_steps ${gradient_accumulation_steps} \ 80 | --num_train_epochs ${epoch} \ 81 | --output_dir checkpoints/${EXPERIMENT_NAME} \ 82 | --learning_rate ${lr} \ 83 | --weight_decay 0.0005 \ 84 | --seed ${seed} \ 85 | --warmup_ratio 0.2 \ 86 | --evaluation_strategy steps \ 87 | --eval_steps ${eval_steps} \ 88 | --remove_unused_columns False \ 89 | --model_name_or_path $model_name_or_path \ 90 | --processor_path $processor_path \ 91 | --use_fast_tokenizer True \ 92 | 
--model_revision main \ 93 | --eval_type val \ 94 | --generation_max_length ${generation_max_length} \ 95 | --label_max_length ${label_max_length} \ 96 | --max_eval_samples 800 \ 97 | --max_predict_samples 800 \ 98 | --using_instruct_qformer False \ 99 | --run_name instructblip-vicuna-13b \ 100 | --load_best_model_at_end True \ 101 | --metric_for_best_model accuracy \ 102 | --greater_is_better True \ 103 | --backbone_model $backbone_model \ 104 | --save_strategy steps \ 105 | --save_steps ${save_steps} \ 106 | --full_bf16_training True \ 107 | --dataloader_num_workers 64 \ 108 | --image_place_holder ${image_place_holder} \ 109 | --logging_steps 100 \ 110 | --data_dir ${data_dir} \ 111 | --overwrite_output_dir \ 112 | --deepspeed ${deepspeed_config} \ 113 | # --max_steps ${max_steps} \ 114 | 115 | -------------------------------------------------------------------------------- /run_script/flickr/deep_speed_instructblip_vicuna13.sh: -------------------------------------------------------------------------------- 1 | export EXPERIMENT_NAME=instruct_BLIP2_deepSpeed_vicuna13b 2 | export DATASET_NAME=flickr 3 | export MODEL_DIR=models/ 4 | export NCCL_P2P_LEVEL=NVL 5 | export MODEL_NAME=instructblip-vicuna-13b 6 | model_name_or_path=instructblip-vicuna-13b 7 | processor_path=instructblip-vicuna-13b 8 | # Calculate the number of GPUs 9 | IFS=',' read -ra GPU_IDS <<< "$CUDA_VISIBLE_DEVICES" 10 | num_gpus="${#GPU_IDS[@]}" 11 | 12 | echo "Number of GPUs available: $num_gpus" 13 | 14 | 15 | 16 | bs=1 17 | eval_bs=2 18 | lr=5e-5 19 | dropout=0.1 20 | epoch=2 21 | seed=2048 22 | do_train=True 23 | do_test=True 24 | do_valid=True 25 | master_port=29507 26 | backbone_model=vicuna 27 | model_type=instructblip 28 | data_dir=MIC_tool 29 | 30 | eval_steps=1000 31 | save_steps=1000 32 | 33 | deepspeed_config=config/deepspeed_config.json 34 | generation_max_length=128 35 | label_max_length=128 36 | max_seq_length=1152 37 | 38 | load_datatype=json 39 | DONE_PREPROCESS=False 40 | TRAIN_PREPROCESS=True 41 | # Set TRAIN_PREPROCESS to be False if you want to use the streaming preprocess of huggingface dataset; 42 | # OR it will preprocess the data in the data colloctor fuction of the dataset 43 | # 926000 44 | train_data_size=1889000 45 | gradient_accumulation_steps=8 46 | image_place_holder='' 47 | 48 | max_steps=$((($epoch * $train_data_size) / ($bs * $gradient_accumulation_steps * $num_gpus))) 49 | 50 | # if set train_preprocess to be False and done_preprocess to be False, to enable the streaminig preprocess of huggingface dataset 51 | # then you need to set the number of max_steps and uncomment the last line 52 | echo "Max Step per GPU: $max_steps" 53 | 54 | # python -m debugpy --wait-for-client --listen 5679 run.py \ 55 | deepspeed --master_port $master_port --num_gpus $num_gpus run.py \ 56 | --experiment_name ${EXPERIMENT_NAME} \ 57 | --dataset_name ${DATASET_NAME} \ 58 | --dataset_config_name None \ 59 | --max_seq_length ${max_seq_length} \ 60 | --overwrite_cache True \ 61 | --pad_to_max_length True \ 62 | --done_preprocess ${DONE_PREPROCESS} \ 63 | --training_preprocess ${TRAIN_PREPROCESS} \ 64 | --load_datatype ${load_datatype} \ 65 | --train_file ${data_dir}/MMICL_vicuna_json/train/train.jsonl \ 66 | --validation_file ${data_dir}/MMICL_vicuna_json/val/val.jsonl \ 67 | --test_file ${data_dir}/MMICL_vicuna_json/test/test.jsonl \ 68 | --do_train $do_train \ 69 | --do_eval $do_valid \ 70 | --do_predict $do_test \ 71 | --per_device_train_batch_size ${bs} \ 72 | --per_device_eval_batch_size ${eval_bs} \ 73 | 
--bf16 \ 74 | --bf16_full_eval \ 75 | --model_type $model_type \ 76 | --save_total_limit 2 \ 77 | --gradient_accumulation_steps ${gradient_accumulation_steps} \ 78 | --num_train_epochs ${epoch} \ 79 | --output_dir checkpoints/${EXPERIMENT_NAME} \ 80 | --learning_rate ${lr} \ 81 | --weight_decay 0.0005 \ 82 | --seed ${seed} \ 83 | --warmup_ratio 0.2 \ 84 | --evaluation_strategy steps \ 85 | --eval_steps ${eval_steps} \ 86 | --remove_unused_columns False \ 87 | --model_name_or_path $model_name_or_path \ 88 | --processor_path $processor_path \ 89 | --use_fast_tokenizer True \ 90 | --model_revision main \ 91 | --eval_type val \ 92 | --generation_max_length ${generation_max_length} \ 93 | --label_max_length ${label_max_length} \ 94 | --max_eval_samples 800 \ 95 | --max_predict_samples 800 \ 96 | --using_instruct_qformer False \ 97 | --run_name instructblip-vicuna-13b \ 98 | --load_best_model_at_end True \ 99 | --metric_for_best_model accuracy \ 100 | --greater_is_better True \ 101 | --backbone_model $backbone_model \ 102 | --save_strategy steps \ 103 | --save_steps ${save_steps} \ 104 | --full_bf16_training True \ 105 | --dataloader_num_workers 64 \ 106 | --image_place_holder ${image_place_holder} \ 107 | --logging_steps 100 \ 108 | --data_dir ${data_dir} \ 109 | --overwrite_output_dir \ 110 | --deepspeed ${deepspeed_config} \ 111 | # --max_steps ${max_steps} \ 112 | 113 | -------------------------------------------------------------------------------- /run_script/flickr/deep_speed_instructblip_vicuna7b.sh: -------------------------------------------------------------------------------- 1 | export EXPERIMENT_NAME=instruct_BLIP2_deepSpeed_vicuna7b 2 | export DATASET_NAME=flickr 3 | export MODEL_DIR=models/ 4 | export NCCL_P2P_LEVEL=NVL 5 | export MODEL_NAME=instructblip-vicuna-13b 6 | model_name_or_path=model/instructblip-vicuna-7b 7 | processor_path=model/instructblip-vicuna-7b 8 | # Calculate the number of GPUs 9 | IFS=',' read -ra GPU_IDS <<< "$CUDA_VISIBLE_DEVICES" 10 | num_gpus="${#GPU_IDS[@]}" 11 | 12 | echo "Number of GPUs available: $num_gpus" 13 | 14 | 15 | 16 | bs=1 17 | eval_bs=2 18 | lr=5e-5 19 | dropout=0.1 20 | epoch=2 21 | seed=2048 22 | do_train=True 23 | do_test=True 24 | do_valid=True 25 | master_port=29507 26 | backbone_model=vicuna 27 | model_type=instructblip 28 | data_dir=MIC_tool 29 | 30 | eval_steps=1000 31 | save_steps=1000 32 | 33 | deepspeed_config=config/deepspeed_config.json 34 | generation_max_length=128 35 | label_max_length=128 36 | max_seq_length=1152 37 | 38 | load_datatype=json 39 | DONE_PREPROCESS=False 40 | TRAIN_PREPROCESS=True 41 | # Set TRAIN_PREPROCESS to be False if you want to use the streaming preprocess of huggingface dataset; 42 | # OR it will preprocess the data in the data colloctor fuction of the dataset 43 | # 926000 44 | train_data_size=1889000 45 | gradient_accumulation_steps=8 46 | image_place_holder='' 47 | 48 | max_steps=$((($epoch * $train_data_size) / ($bs * $gradient_accumulation_steps * $num_gpus))) 49 | 50 | # if set train_preprocess to be False and done_preprocess to be False, to enable the streaminig preprocess of huggingface dataset 51 | # then you need to set the number of max_steps and uncomment the last line 52 | echo "Max Step per GPU: $max_steps" 53 | 54 | # python -m debugpy --wait-for-client --listen 5679 run.py \ 55 | deepspeed --master_port $master_port --num_gpus $num_gpus run.py \ 56 | --experiment_name ${EXPERIMENT_NAME} \ 57 | --dataset_name ${DATASET_NAME} \ 58 | --dataset_config_name None \ 59 | --max_seq_length 
${max_seq_length} \ 60 | --overwrite_cache True \ 61 | --pad_to_max_length True \ 62 | --done_preprocess ${DONE_PREPROCESS} \ 63 | --training_preprocess ${TRAIN_PREPROCESS} \ 64 | --load_datatype ${load_datatype} \ 65 | --train_file ${data_dir}/MMICL_vicuna_json/train/train.jsonl \ 66 | --validation_file ${data_dir}/MMICL_vicuna_json/val/val.jsonl \ 67 | --test_file ${data_dir}/MMICL_vicuna_json/test/test.jsonl \ 68 | --do_train $do_train \ 69 | --do_eval $do_valid \ 70 | --do_predict $do_test \ 71 | --per_device_train_batch_size ${bs} \ 72 | --per_device_eval_batch_size ${eval_bs} \ 73 | --bf16 \ 74 | --bf16_full_eval \ 75 | --model_type $model_type \ 76 | --save_total_limit 2 \ 77 | --gradient_accumulation_steps ${gradient_accumulation_steps} \ 78 | --num_train_epochs ${epoch} \ 79 | --output_dir checkpoints/${EXPERIMENT_NAME} \ 80 | --learning_rate ${lr} \ 81 | --weight_decay 0.0005 \ 82 | --seed ${seed} \ 83 | --warmup_ratio 0.2 \ 84 | --evaluation_strategy steps \ 85 | --eval_steps ${eval_steps} \ 86 | --remove_unused_columns False \ 87 | --model_name_or_path $model_name_or_path \ 88 | --processor_path $processor_path \ 89 | --use_fast_tokenizer True \ 90 | --model_revision main \ 91 | --eval_type val \ 92 | --generation_max_length ${generation_max_length} \ 93 | --label_max_length ${label_max_length} \ 94 | --max_eval_samples 800 \ 95 | --max_predict_samples 800 \ 96 | --using_instruct_qformer False \ 97 | --run_name instructblip-vicuna-13b \ 98 | --load_best_model_at_end True \ 99 | --metric_for_best_model accuracy \ 100 | --greater_is_better True \ 101 | --backbone_model $backbone_model \ 102 | --save_strategy steps \ 103 | --save_steps ${save_steps} \ 104 | --full_bf16_training True \ 105 | --dataloader_num_workers 64 \ 106 | --image_place_holder ${image_place_holder} \ 107 | --logging_steps 100 \ 108 | --data_dir ${data_dir} \ 109 | --overwrite_output_dir \ 110 | --deepspeed ${deepspeed_config} \ 111 | # --max_steps ${max_steps} \ 112 | 113 | -------------------------------------------------------------------------------- /search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | 5 | from glob import glob 6 | 7 | from tasks.utils import * 8 | 9 | TASK = sys.argv[1] 10 | MODEL = sys.argv[2] 11 | 12 | if len(sys.argv) == 4: 13 | METRIC = sys.argv[3] 14 | elif TASK in GLUE_DATASETS + SUPERGLUE_DATASETS: 15 | METRIC = "accuracy" 16 | elif TASK in NER_DATASETS + SRL_DATASETS + QA_DATASETS: 17 | METRIC = "f1" 18 | 19 | best_score = 0 20 | 21 | files = glob(f"./checkpoints/{TASK}-{MODEL}-search/*/best_results.json") 22 | 23 | for f in files: 24 | metrics = json.load(open(f, 'r')) 25 | if metrics["best_eval_"+METRIC] > best_score: 26 | best_score = metrics["best_eval_"+METRIC] 27 | best_metrics = metrics 28 | best_file_name = f 29 | 30 | print(f"best_{METRIC}: {best_score}") 31 | print(f"best_metrics: {best_metrics}") 32 | print(f"best_file: {best_file_name}") 33 | 34 | -------------------------------------------------------------------------------- /tasks/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HaozheZhao/MIC/8e1f4ecd57ee30a4c4e4b89b02ba4e77a09fe069/tasks/.DS_Store -------------------------------------------------------------------------------- /tasks/utils.py: -------------------------------------------------------------------------------- 1 | 2 | VQA_TASKS = ["flickr"] 3 | 4 | DATASETS = VQA_TASKS 5 | 6 | 7 | USE_FAST 
= { 8 | 'blip2': True, 9 | 'instructblip': True 10 | } -------------------------------------------------------------------------------- /tasks/vqa/get_trainer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | from transformers import AdamW 4 | 5 | from .dataset import FlickrDataset 6 | from model.utils import get_model 7 | from training.trainer_blip2 import BLIP2Trainer 8 | from training.trainer_instructblip2 import InstructBLIP2Trainer 9 | from model.instructblip.processing_instructblip import InstructBlipProcessor 10 | from model.instructblip.configuration_instructblip import InstructBlipConfig 11 | 12 | from accelerate import Accelerator, DistributedType 13 | from model.blip2.processing_blip_2 import Blip2Processor 14 | from model.blip2.configuration_blip_2 import Blip2Config 15 | import torch 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def get_trainer(args): 20 | model_args, data_args, training_args = args 21 | 22 | log_level = training_args.get_process_log_level() 23 | logger.setLevel(log_level) 24 | model_type = model_args.model_type 25 | backbone_model = model_args.backbone_model 26 | if model_type == 'blip2': 27 | config = Blip2Config.from_pretrained( 28 | model_args.model_name_or_path 29 | ) 30 | elif model_type == 'instructblip': 31 | config = InstructBlipConfig.from_pretrained( 32 | model_args.model_name_or_path 33 | ) 34 | if backbone_model == 'vicuna': 35 | config.text_config.pad_token_id = 32000 36 | config.text_config.max_sequence_length = data_args.max_seq_length 37 | model = get_model(model_args, config) 38 | if training_args.full_bf16_training: 39 | model = model.to(dtype=torch.bfloat16) 40 | 41 | if model_args.image_place_holder is not None: 42 | image_place_holder = model_args.image_place_holder 43 | else: 44 | image_place_holder = "图" if model_args.backbone_model == 'flan-t5' else "" 45 | print(f"image_place_holder: {image_place_holder}") 46 | processor_path = model_args.processor_path if model_args.processor_path is not None else model_args.model_name_or_path 47 | if model_type == 'blip2': 48 | processor = Blip2Processor.from_pretrained( 49 | processor_path, 50 | ) 51 | if backbone_model == 'flan-t5': 52 | sp = [image_place_holder]+[f"" for i in range(20)] 53 | sp = sp+processor.tokenizer.additional_special_tokens[len(sp):] 54 | processor.tokenizer.add_special_tokens({'additional_special_tokens':sp}) 55 | else: # opt 56 | sp = [image_place_holder]+[f"" for i in range(20)] 57 | processor.tokenizer.add_special_tokens({'additional_special_tokens':sp}) 58 | model.language_model.resize_token_embeddings(len(processor.tokenizer)) 59 | 60 | elif model_type == 'instructblip': 61 | processor = InstructBlipProcessor.from_pretrained( 62 | processor_path 63 | ) 64 | 65 | if backbone_model == 'flan-t5': 66 | sp = [image_place_holder]+[f"" for i in range(20)] 67 | sp = sp+processor.tokenizer.additional_special_tokens[len(sp):] 68 | processor.tokenizer.add_special_tokens({'additional_special_tokens':sp}) 69 | else: #vicuna 70 | sp = [image_place_holder]+[f"" for i in range(20)] 71 | processor.tokenizer.add_special_tokens({'additional_special_tokens':sp}) 72 | processor.qformer_tokenizer.add_special_tokens({'additional_special_tokens':sp}) 73 | model.language_model.resize_token_embeddings(len(processor.tokenizer)) 74 | model.language_model.model.embed_tokens.weight.requires_grad=True 75 | # bert tokenizer for q-former 76 | 
processor.qformer_tokenizer.add_special_tokens({'additional_special_tokens':sp}) 77 | if model.qformer.embeddings.word_embeddings.weight.shape[0] != len(processor.qformer_tokenizer): 78 | model.qformer.resize_token_embeddings(len(processor.qformer_tokenizer)) 79 | config.text_config._from_model_config =False 80 | 81 | dataset = FlickrDataset(processor, model_args, data_args, training_args, config) 82 | special_visual_token_id = dataset.special_visual_token_id 83 | model_args.special_visual_token_id = special_visual_token_id 84 | # if training_args.do_train: 85 | # for index in random.sample(range(len(dataset.train_dataset)), 1): 86 | # logger.info(f"Sample keys {index} of the training set: {dataset.train_dataset[index].keys()}.") 87 | # if not data_args.done_preprocess: 88 | # input_text = dataset.train_dataset[index]["input_text"] 89 | # logger.info(f"Sample input_text {index} of the training set: {input_text}.") 90 | # output_text = dataset.train_dataset[index]["output_text"] 91 | # logger.info(f"Sample output_text {index} of the training set: {output_text}.") 92 | 93 | # input_ids = dataset.train_dataset[index]["input_ids"] 94 | # logger.info(f"Sample input_ids {index} of the training set: {input_ids}.") 95 | # attention_mask = dataset.train_dataset[index]["attention_mask"] 96 | # logger.info(f"Sample attention_mask {index} of the training set: {attention_mask}.") 97 | # label = dataset.train_dataset[index]["label"] 98 | # logger.info(f"Sample label {index} of the training set: {label}.") 99 | 100 | if model_type == 'blip2': 101 | trainer = BLIP2Trainer( 102 | processor=processor, 103 | model=model, 104 | config=config, 105 | args=training_args, 106 | model_args=model_args, 107 | train_dataset=dataset.train_dataset if training_args.do_train else None, 108 | eval_dataset=dataset.eval_dataset if training_args.do_eval else None, 109 | predict_dataset=dataset.predict_dataset, 110 | compute_metrics=dataset.compute_metrics, 111 | data_collator=dataset.data_collator, 112 | ) 113 | elif model_type == 'instructblip': 114 | trainer = InstructBLIP2Trainer( 115 | processor=processor, 116 | model=model, 117 | config=config, 118 | args=training_args, 119 | model_args=model_args, 120 | train_dataset=dataset.train_dataset if training_args.do_train else None, 121 | eval_dataset=dataset.eval_dataset if training_args.do_eval else None, 122 | predict_dataset=dataset.predict_dataset, 123 | compute_metrics=dataset.compute_metrics, 124 | data_collator=dataset.data_collator, 125 | ) 126 | 127 | return trainer, dataset.predict_dataset 128 | -------------------------------------------------------------------------------- /training/trainer_base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict, OrderedDict 4 | 5 | from transformers import Trainer 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | _default_log_level = logging.INFO 10 | logger.setLevel(_default_log_level) 11 | 12 | class BaseTrainer(Trainer): 13 | def __init__(self, *args, predict_dataset = None, test_key = "accuracy", **kwargs): 14 | super().__init__(*args, **kwargs) 15 | self.predict_dataset = predict_dataset 16 | self.test_key = test_key 17 | self.best_metrics = OrderedDict({ 18 | "best_epoch": 0, 19 | f"best_eval_{self.test_key}": 0, 20 | }) 21 | 22 | def log_best_metrics(self): 23 | self.log_metrics("best", self.best_metrics) 24 | self.save_metrics("best", self.best_metrics, combined=False) 25 | 26 | def _maybe_log_save_evaluate(self, tr_loss, 
model, trial, epoch, ignore_keys_for_eval): 27 | if self.control.should_log: 28 | logs: Dict[str, float] = {} 29 | 30 | 31 | tr_loss_scalar = self._nested_gather(tr_loss).mean().item() 32 | 33 | # reset tr_loss to zero 34 | tr_loss -= tr_loss 35 | 36 | logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) 37 | logs["learning_rate"] = self._get_learning_rate() 38 | 39 | self._total_loss_scalar += tr_loss_scalar 40 | self._globalstep_last_logged = self.state.global_step 41 | self.store_flos() 42 | 43 | self.log(logs) 44 | 45 | eval_metrics = None 46 | if self.control.should_evaluate: 47 | eval_metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) 48 | self._report_to_hp_search(trial, epoch, eval_metrics) 49 | 50 | if eval_metrics["eval_"+self.test_key] > self.best_metrics["best_eval_"+self.test_key]: 51 | self.best_metrics["best_epoch"] = epoch 52 | self.best_metrics["best_eval_"+self.test_key] = eval_metrics["eval_"+self.test_key] 53 | 54 | if self.predict_dataset is not None: 55 | if isinstance(self.predict_dataset, dict): 56 | for dataset_name, dataset in self.predict_dataset.items(): 57 | _, _, test_metrics = self.predict(dataset, metric_key_prefix="test") 58 | self.best_metrics[f"best_test_{dataset_name}_{self.test_key}"] = test_metrics["test_"+self.test_key] 59 | else: 60 | _, _, test_metrics = self.predict(self.predict_dataset, metric_key_prefix="test") 61 | self.best_metrics["best_test_"+self.test_key] = test_metrics["test_"+self.test_key] 62 | 63 | logger.info(f"***** Epoch {epoch}: Best results *****") 64 | for key, value in self.best_metrics.items(): 65 | logger.info(f"{key} = {value}") 66 | self.log(self.best_metrics) 67 | 68 | if self.control.should_save: 69 | self._save_checkpoint(model, trial, metrics=eval_metrics) 70 | self.control = self.callback_handler.on_save(self.args, self.state, self.control) 71 | 72 | def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: 73 | """ 74 | Perform a training step on a batch of inputs. 75 | 76 | Subclass and override to inject custom behavior. 77 | 78 | Args: 79 | model (`nn.Module`): 80 | The model to train. 81 | inputs (`Dict[str, Union[torch.Tensor, Any]]`): 82 | The inputs and targets of the model. 83 | 84 | The dictionary will be unpacked before being fed to the model. Most models expect the targets under the 85 | argument `labels`. Check your model's documentation for all accepted arguments. 86 | 87 | Return: 88 | `torch.Tensor`: The tensor with training loss on this batch. 
89 | """ 90 | model.train() 91 | _, input_models = self._prepare_inputs(inputs, step="train") 92 | 93 | if is_sagemaker_mp_enabled(): 94 | scaler = self.scaler if self.do_grad_scaling else None 95 | loss_mb = smp_forward_backward(model, input_models, self.args.gradient_accumulation_steps, scaler=scaler) 96 | return loss_mb.reduce_mean().detach().to(self.args.device) 97 | 98 | with self.autocast_smart_context_manager(): 99 | if self.model_args.data_augmentation == "none": 100 | loss, outputs = self.compute_loss(model, input_models, return_outputs=True, step="train") 101 | if self.data_args.dataset_name in ["copa", "record"]: 102 | loss = self.task_helper.logits2loss(inputs, outputs) 103 | elif self.model_args.data_augmentation == "rdrop": 104 | loss1, outputs1 = self.compute_loss(model, input_models, return_outputs=True) 105 | if self.data_args.dataset_name in ["copa", "record"]: 106 | loss1 = self.task_helper.logits2loss(inputs, outputs1) 107 | loss2, outputs2 = self.compute_loss(model, input_models, return_outputs=True) 108 | if self.data_args.dataset_name in ["copa", "record"]: 109 | loss2 = self.task_helper.logits2loss(inputs, outputs2) 110 | bce_loss = 0.5 * loss1 + 0.5 * loss2 111 | kl_loss = compute_kl_loss(outputs1["logits"], outputs2["logits"]) 112 | loss = bce_loss + kl_loss 113 | # loss, outputs = self.compute_loss(model, input_models, return_outputs=True) 114 | # if self.data_args.dataset_name in ["copa", "record"]: 115 | # loss = self.task_helper.logits2loss(inputs, outputs) 116 | 117 | if self.args.n_gpu > 1: 118 | loss = loss.mean() # mean() to average on multi-gpu parallel training 119 | 120 | if self.args.gradient_accumulation_steps > 1 and not self.deepspeed: 121 | # deepspeed handles loss scaling by gradient_accumulation_steps in its `backward` 122 | loss = loss / self.args.gradient_accumulation_steps 123 | 124 | if self.do_grad_scaling: 125 | self.scaler.scale(loss).backward() 126 | elif self.use_apex: 127 | with amp.scale_loss(loss, self.optimizer) as scaled_loss: 128 | scaled_loss.backward() 129 | elif self.deepspeed: 130 | # loss gets scaled under gradient_accumulation_steps in deepspeed 131 | loss = self.deepspeed.backward(loss) 132 | else: 133 | loss.backward() 134 | 135 | return loss.detach() -------------------------------------------------------------------------------- /training/trainer_blip2.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | from transformers import Trainer 4 | import matplotlib.pyplot as plt 5 | 6 | import time 7 | from typing import List, Optional, Union 8 | from transformers.trainer_utils import speed_metrics 9 | from transformers.debug_utils import DebugOption 10 | import math 11 | 12 | from torch.utils.tensorboard import SummaryWriter 13 | 14 | from transformers.file_utils import is_sagemaker_mp_enabled 15 | 16 | if is_sagemaker_mp_enabled(): 17 | from transformers.trainer_pt_utils import ( 18 | smp_forward_backward, 19 | smp_forward_only, 20 | smp_gather, 21 | smp_nested_concat, 22 | ) 23 | 24 | from typing import Dict, OrderedDict, Union, Any 25 | 26 | from typing import Any, Dict, List, Optional, Tuple, Union, OrderedDict 27 | 28 | import torch 29 | from torch import nn 30 | from torch.utils.data import Dataset 31 | 32 | from transformers.deepspeed import is_deepspeed_zero3_enabled 33 | from transformers.trainer_utils import PredictionOutput 34 | from transformers.utils import logging 35 | 36 | 37 | logger = logging.get_logger(__name__) 38 | 39 | 40 | class 
BLIP2Trainer(Trainer): 41 | 42 | def __init__( 43 | self, 44 | processor, 45 | config, 46 | model_args, 47 | predict_dataset=None, 48 | *args, 49 | **kwargs, 50 | ): 51 | 52 | super().__init__(*args, **kwargs) 53 | if 'args' in kwargs: 54 | self.training_args = kwargs['args'] 55 | 56 | self.test_key = "accuracy" 57 | self.processor = processor 58 | self.model_args = model_args 59 | self.best_metrics = OrderedDict( 60 | { 61 | "best_epoch": 0, 62 | f"best_eval_{self.test_key}": 0, 63 | } 64 | ) 65 | 66 | self.predict_dataset = predict_dataset 67 | self.epoch = 0 68 | 69 | self.config = config 70 | 71 | self.writer = SummaryWriter( 72 | f"./tensorboard_log/{self.model_args.experiment_name}" 73 | ) 74 | self.model_args = model_args 75 | 76 | self.best_metrics = OrderedDict({ 77 | "best_epoch": 0, 78 | f"best_eval_{self.test_key}": 0, 79 | }) 80 | if 'test_dataset' in kwargs: 81 | self.test_dataset = kwargs['test_dataset'] 82 | def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: 83 | """ 84 | Perform a training step on a batch of inputs. 85 | 86 | Subclass and override to inject custom behavior. 87 | 88 | Args: 89 | model (`nn.Module`): 90 | The model to train. 91 | inputs (`Dict[str, Union[torch.Tensor, Any]]`): 92 | The inputs and targets of the model. 93 | 94 | The dictionary will be unpacked before being fed to the model. Most models expect the targets under the 95 | argument `labels`. Check your model's documentation for all accepted arguments. 96 | 97 | Return: 98 | `torch.Tensor`: The tensor with training loss on this batch. 99 | """ 100 | model.train() 101 | inputs = self._prepare_inputs(inputs) 102 | 103 | if is_sagemaker_mp_enabled(): 104 | loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps) 105 | return loss_mb.reduce_mean().detach().to(self.args.device) 106 | 107 | with self.compute_loss_context_manager(): 108 | loss = self.compute_loss(model, inputs) 109 | 110 | if self.args.n_gpu > 1: 111 | loss = loss.mean() # mean() to average on multi-gpu parallel training 112 | 113 | if self.args.gradient_accumulation_steps > 1 and not self.deepspeed: 114 | # deepspeed handles loss scaling by gradient_accumulation_steps in its `backward` 115 | loss = loss / self.args.gradient_accumulation_steps 116 | 117 | if self.do_grad_scaling: 118 | self.scaler.scale(loss).backward() 119 | elif self.use_apex: 120 | with amp.scale_loss(loss, self.optimizer) as scaled_loss: 121 | scaled_loss.backward() 122 | elif self.deepspeed: 123 | # loss gets scaled under gradient_accumulation_steps in deepspeed 124 | loss = self.deepspeed.backward(loss) 125 | else: 126 | loss.backward() 127 | 128 | return loss.detach() 129 | 130 | def evaluate( 131 | self, 132 | eval_dataset: Optional[Dataset] = None, 133 | ignore_keys: Optional[List[str]] = None, 134 | metric_key_prefix: str = "eval", 135 | **gen_kwargs, 136 | ) -> Dict[str, float]: 137 | """ 138 | Run evaluation and returns metrics. 139 | 140 | The calling script will be responsible for providing a method to compute metrics, as they are task-dependent 141 | (pass it to the init `compute_metrics` argument). 142 | 143 | You can also subclass and override this method to inject custom behavior. 144 | 145 | Args: 146 | eval_dataset (`Dataset`, *optional*): 147 | Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns 148 | not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` 149 | method. 
150 | ignore_keys (`List[str]`, *optional*): 151 | A list of keys in the output of your model (if it is a dictionary) that should be ignored when 152 | gathering predictions. 153 | metric_key_prefix (`str`, *optional*, defaults to `"eval"`): 154 | An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named 155 | "eval_bleu" if the prefix is `"eval"` (default) 156 | max_length (`int`, *optional*): 157 | The maximum target length to use when predicting with the generate method. 158 | num_beams (`int`, *optional*): 159 | Number of beams for beam search that will be used when predicting with the generate method. 1 means no 160 | beam search. 161 | gen_kwargs: 162 | Additional `generate` specific kwargs. 163 | 164 | Returns: 165 | A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The 166 | dictionary also contains the epoch number which comes from the training state. 167 | """ 168 | 169 | gen_kwargs = gen_kwargs.copy() 170 | if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: 171 | gen_kwargs["max_length"] = self.args.generation_max_length 172 | if gen_kwargs.get("min_length") is None and gen_kwargs.get("max_min_tokens") is None: 173 | gen_kwargs["min_length"] = self.args.generation_min_length 174 | gen_kwargs["num_beams"] = ( 175 | gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams 176 | ) 177 | self._gen_kwargs = gen_kwargs 178 | 179 | return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) 180 | 181 | def predict( 182 | self, 183 | test_dataset: Dataset, 184 | ignore_keys: Optional[List[str]] = None, 185 | metric_key_prefix: str = "test", 186 | **gen_kwargs, 187 | ) -> PredictionOutput: 188 | """ 189 | Run prediction and returns predictions and potential metrics. 190 | 191 | Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method 192 | will also return metrics, like in `evaluate()`. 193 | 194 | Args: 195 | test_dataset (`Dataset`): 196 | Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the 197 | `model.forward()` method are automatically removed. Has to implement the method `__len__` 198 | ignore_keys (`List[str]`, *optional*): 199 | A list of keys in the output of your model (if it is a dictionary) that should be ignored when 200 | gathering predictions. 201 | metric_key_prefix (`str`, *optional*, defaults to `"eval"`): 202 | An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named 203 | "eval_bleu" if the prefix is `"eval"` (default) 204 | max_length (`int`, *optional*): 205 | The maximum target length to use when predicting with the generate method. 206 | num_beams (`int`, *optional*): 207 | Number of beams for beam search that will be used when predicting with the generate method. 1 means no 208 | beam search. 209 | gen_kwargs: 210 | Additional `generate` specific kwargs. 211 | 212 | 213 | 214 | If your predictions or labels have different sequence lengths (for instance because you're doing dynamic 215 | padding in a token classification task) the predictions will be padded (on the right) to allow for 216 | concatenation into one array. The padding index is -100. 217 | 218 | 219 | 220 | Returns: *NamedTuple* A namedtuple with the following keys: 221 | 222 | - predictions (`np.ndarray`): The predictions on `test_dataset`. 
223 | - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). 224 | - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained 225 | labels). 226 | """ 227 | 228 | gen_kwargs = gen_kwargs.copy() 229 | if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: 230 | gen_kwargs["max_length"] = self.args.generation_max_length 231 | if gen_kwargs.get("min_length") is None and gen_kwargs.get("max_min_tokens") is None: 232 | gen_kwargs["min_length"] = self.args.generation_min_length 233 | gen_kwargs["num_beams"] = ( 234 | gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams 235 | ) 236 | self._gen_kwargs = gen_kwargs 237 | 238 | return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) 239 | 240 | def prediction_step( 241 | self, 242 | model: nn.Module, 243 | inputs: Dict[str, Union[torch.Tensor, Any]], 244 | prediction_loss_only: bool, 245 | ignore_keys: Optional[List[str]] = None, 246 | ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: 247 | """ 248 | Perform an evaluation step on `model` using `inputs`. 249 | 250 | Subclass and override to inject custom behavior. 251 | 252 | Args: 253 | model (`nn.Module`): 254 | The model to evaluate. 255 | inputs (`Dict[str, Union[torch.Tensor, Any]]`): 256 | The inputs and targets of the model. 257 | 258 | The dictionary will be unpacked before being fed to the model. Most models expect the targets under the 259 | argument `labels`. Check your model's documentation for all accepted arguments. 260 | prediction_loss_only (`bool`): 261 | Whether or not to return the loss only. 262 | 263 | Return: 264 | Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and 265 | labels (each being optional). 
266 | """ 267 | 268 | if not self.args.predict_with_generate or prediction_loss_only: 269 | return super().prediction_step( 270 | model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys 271 | ) 272 | 273 | has_labels = "labels" in inputs 274 | inputs = self._prepare_inputs(inputs) 275 | 276 | # XXX: adapt synced_gpus for fairscale as well 277 | gen_kwargs = self._gen_kwargs.copy() 278 | if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: 279 | gen_kwargs["max_length"] = self.model.config.max_length 280 | gen_kwargs["num_beams"] = ( 281 | gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams 282 | ) 283 | default_synced_gpus = True if is_deepspeed_zero3_enabled() else False 284 | gen_kwargs["synced_gpus"] = ( 285 | gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus 286 | ) 287 | 288 | generated_tokens = self.model.generate( 289 | pixel_values = inputs['pixel_values'], 290 | input_ids = inputs['input_ids'], 291 | attention_mask = inputs['attention_mask'], 292 | img_mask = inputs['img_mask'], 293 | **gen_kwargs, 294 | ) 295 | # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop 296 | # TODO: remove this hack when the legacy code that initializes generation_config from a model config is 297 | # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 298 | # in case the batch is shorter than max length, the output should be padded 299 | # if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: 300 | # generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) 301 | # elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( 302 | # gen_kwargs["max_new_tokens"] + 1 303 | # ): 304 | # generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) 305 | 306 | with torch.no_grad(): 307 | if has_labels: 308 | with self.compute_loss_context_manager(): 309 | outputs = model(**inputs) 310 | if self.label_smoother is not None: 311 | loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() 312 | else: 313 | loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() 314 | else: 315 | loss = None 316 | 317 | if self.args.prediction_loss_only: 318 | return (loss, None, None) 319 | 320 | if has_labels: 321 | labels = inputs["labels"] 322 | # if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: 323 | # labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) 324 | # elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( 325 | # gen_kwargs["max_new_tokens"] + 1 326 | # ): 327 | # labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) 328 | else: 329 | labels = None 330 | 331 | return (loss, generated_tokens, labels) 332 | 333 | def _pad_tensors_to_max_len(self, tensor, max_length): 334 | if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): 335 | # If PAD token is not defined at least EOS token has to be defined 336 | pad_token_id = ( 337 | self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id 338 | ) 339 | else: 340 | if self.model.config.pad_token_id is not None: 341 
| pad_token_id = self.model.config.pad_token_id 342 | else: 343 | raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") 344 | 345 | padded_tensor = pad_token_id * torch.ones( 346 | (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device 347 | ) 348 | padded_tensor[:, : tensor.shape[-1]] = tensor 349 | return padded_tensor 350 | 351 | def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): 352 | if self.control.should_log: 353 | 354 | logs: Dict[str, float] = {} 355 | 356 | # all_gather + mean() to get average loss over all processes 357 | tr_loss_scalar = self._nested_gather(tr_loss).mean().item() 358 | 359 | # reset tr_loss to zero 360 | tr_loss -= tr_loss 361 | 362 | logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) 363 | logs["learning_rate"] = self._get_learning_rate() 364 | 365 | self._total_loss_scalar += tr_loss_scalar 366 | self._globalstep_last_logged = self.state.global_step 367 | self.store_flos() 368 | 369 | self.log(logs) 370 | 371 | metrics = None 372 | if self.control.should_evaluate: 373 | if "train" in self.model_args.eval_type: 374 | logger.info(f"***** Running Evaluation for train dataset *****") 375 | metrics = self.evaluate( 376 | eval_dataset=self.train_dataset, 377 | ignore_keys=ignore_keys_for_eval, 378 | ) 379 | self._report_to_hp_search(trial, epoch, metrics) 380 | # if "test" in self.model_args.eval_type: 381 | # logger.info(f"***** Running Evaluation for test dataset *****") 382 | # metrics = self.evaluate(ignore_keys=ignore_keys_for_eval, eval_dataset=self.test_dataset) 383 | # self._report_to_hp_search(trial, epoch, metrics) 384 | # else: 385 | logger.info(f"***** Running Evaluation for eval dataset *****") 386 | eval_metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) 387 | self._report_to_hp_search(trial, epoch, eval_metrics) 388 | 389 | if eval_metrics["eval_"+self.test_key] > self.best_metrics["best_eval_"+self.test_key]: 390 | self.best_metrics["best_epoch"] = epoch 391 | self.best_metrics["best_eval_"+self.test_key] = eval_metrics["eval_"+self.test_key] 392 | self.best_model = model 393 | # self._save_checkpoint(self.best_model , trial, metrics=self.best_metrics) 394 | 395 | logger.info(f"***** Epoch {epoch}: Best results *****") 396 | for key, value in self.best_metrics.items(): 397 | logger.info(f"{key} = {value}") 398 | self.log(self.best_metrics) 399 | 400 | if self.control.should_save: 401 | self._save_checkpoint(model, trial, metrics=metrics) 402 | self.control = self.callback_handler.on_save(self.args, self.state, self.control) 403 | 404 | 405 | def log_best_metrics(self): 406 | self.log_metrics("best", self.best_metrics) 407 | self.save_metrics("best", self.best_metrics, combined=False) --------------------------------------------------------------------------------
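The `rdrop` branch of `BaseTrainer.training_step` above averages two forward passes and adds a `compute_kl_loss` term, but that helper is defined elsewhere in the repository and not shown in this section. Below is a minimal sketch of the standard R-Drop symmetric-KL formulation it presumably follows; the function name matches the call site, while the signature, the optional padding mask, and the sum-then-average reduction are assumptions, not the repository's actual implementation.

```python
import torch
import torch.nn.functional as F


def compute_kl_loss(p_logits: torch.Tensor, q_logits: torch.Tensor,
                    pad_mask: torch.Tensor = None) -> torch.Tensor:
    """Symmetric KL between two stochastic forward passes (R-Drop-style regularizer).

    NOTE: sketch only -- the padding mask and reduction are assumptions.
    """
    # KL(p || q) and KL(q || p), element-wise so padded positions can be zeroed out.
    p_loss = F.kl_div(F.log_softmax(p_logits, dim=-1),
                      F.softmax(q_logits, dim=-1), reduction="none")
    q_loss = F.kl_div(F.log_softmax(q_logits, dim=-1),
                      F.softmax(p_logits, dim=-1), reduction="none")
    if pad_mask is not None:
        p_loss = p_loss.masked_fill(pad_mask, 0.0)
        q_loss = q_loss.masked_fill(pad_mask, 0.0)
    # Average the two KL directions after summing over tokens and vocabulary.
    return 0.5 * (p_loss.sum() + q_loss.sum())
```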
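Taken together, `get_trainer` builds the processor, model, and Flickr dataset and returns a `BLIP2Trainer` or `InstructBLIP2Trainer` plus the prediction split. The sketch below shows how such a trainer is typically driven from a training entry point; it is illustrative only. The argument dataclass names (`ModelArguments`, `DataTrainingArguments`) and the `arguments` module they are imported from are assumptions, and the repository's real entry point is not reproduced here.

```python
# Hypothetical driver for the trainer returned by get_trainer. The argument
# dataclasses and their module are assumed names, not the repository's API.
from transformers import HfArgumentParser, TrainingArguments

from arguments import ModelArguments, DataTrainingArguments  # assumed names
from tasks.vqa.get_trainer import get_trainer


def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # get_trainer expects the three argument objects bundled as a single tuple.
    trainer, predict_dataset = get_trainer((model_args, data_args, training_args))

    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        trainer.log_best_metrics()

    if training_args.do_predict and predict_dataset is not None:
        trainer.predict(predict_dataset, metric_key_prefix="test")


if __name__ == "__main__":
    main()
```

After checkpoints are written, `search.py` scans `checkpoints/<TASK>-<MODEL>-search/*/best_results.json` for the best run. Since the `tasks/utils.py` shown above defines only `VQA_TASKS`, passing the metric explicitly (for example `python search.py flickr instructblip accuracy`, with illustrative task/model names) avoids the fallback branches that reference GLUE/NER dataset lists not defined in that module.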