├── .gitignore ├── LICENSE ├── README.md ├── assets └── framework.png ├── finetune └── scripts │ ├── llava │ ├── finetune_task_lora.sh │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json │ ├── mplug_owl2 │ ├── finetune_lora.sh │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json │ └── qwenvl │ ├── ds_config_zero2.json │ ├── ds_config_zero3.json │ ├── finetune.py │ ├── finetune_ds.sh │ ├── finetune_lora_ds.sh │ ├── finetune_lora_single_gpu.sh │ ├── finetune_qlora_ds.sh │ └── finetune_qlora_single_gpu.sh ├── flickr30k_pipeline.py ├── internvl_chat ├── README.md ├── eval │ └── run_internvl.py ├── internvl │ ├── conversation.py │ ├── dist_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── internlm2 │ │ │ ├── configuration_internlm2.py │ │ │ ├── modeling_internlm2.py │ │ │ ├── tokenization_internlm2.py │ │ │ └── tokenization_internlm2_fast.py │ │ ├── internvl_chat │ │ │ ├── __init__.py │ │ │ ├── configuration_intern_vit.py │ │ │ ├── configuration_internvl_chat.py │ │ │ ├── flash_attention.py │ │ │ ├── modeling_intern_vit.py │ │ │ └── modeling_internvl_chat.py │ │ └── phi3 │ │ │ ├── configuration_phi3.py │ │ │ └── modeling_phi3.py │ ├── patch │ │ ├── __init__.py │ │ ├── llama2_flash_attn_monkey_patch.py │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_rmsnorm_monkey_patch.py │ │ ├── pad_data_collator.py │ │ └── train_sampler_patch.py │ └── train │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── dataset.py │ │ ├── internvl_chat_finetune.py │ │ ├── internvl_chat_pretrain.py │ │ └── trainer_monkey_patch.py ├── pyproject.toml ├── shell │ ├── data │ │ ├── internvl_2_finetune_flickr30k_rerank.json │ │ ├── internvl_2_finetune_mmqa_qa.json │ │ ├── internvl_2_finetune_mmqa_rerank.json │ │ ├── internvl_2_finetune_mscoco_rerank.json │ │ ├── internvl_2_finetune_webqa_qa.json │ │ └── internvl_2_finetune_webqa_rerank.json │ ├── internvl1.2 │ │ ├── 2nd_finetune │ │ │ ├── internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh │ │ │ └── internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_lora.sh │ │ └── hermes2_yi34b │ │ │ └── internvl_chat_v1_2_hermes2_yi34b_448_res_finetune.sh │ ├── internvl1.5 │ │ ├── 2nd_finetune │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh │ │ │ └── internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh │ │ ├── hermes2_yi34b │ │ │ ├── internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh │ │ ├── internlm2_1_8b │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain.sh │ │ ├── internlm2_20b │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh │ │ └── phi3_3_8b │ │ │ ├── internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain.sh │ └── internvl2.0 │ │ └── 2nd_finetune │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_lora.sh │ │ ├── internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_lora.sh │ │ ├── 
internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco.sh │ │ ├── internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_lora.sh │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh │ │ ├── internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora.sh │ │ ├── internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full.sh │ │ └── internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora.sh ├── zero_stage1_config.json ├── zero_stage2_config.json ├── zero_stage3_config.json ├── zero_stage3_config_100b.json ├── zero_stage3_config_34b.json └── zero_stage3_config_70b.json ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ ├── table │ │ ├── answer │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ ├── answer_bard.jsonl │ │ │ ├── answer_gpt35.jsonl │ │ │ ├── answer_llama-13b.jsonl │ │ │ └── answer_vicuna-13b.jsonl │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ ├── model.jsonl │ │ ├── prompt.jsonl │ │ ├── question.jsonl │ │ ├── results │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ ├── review │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ ├── reviewer.jsonl │ │ └── rule.json │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_mistral.py │ │ └── llava_mpt.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── merge_lora.py ├── mmqa_oracle.py ├── mmqa_pipeline.py ├── mplug_owl2 ├── __init__.py ├── constants.py ├── conversation.py ├── evaluate │ ├── EVALUATION.md │ ├── __init__.py │ ├── evaluate_caption.py │ ├── evaluate_mmbench.py │ ├── evaluate_mme.py │ ├── evaluate_mmmu.py │ ├── evaluate_vqa.py │ ├── mmbench_converter.py │ ├── run_mplug_owl2.py │ ├── vqa.py │ └── vqa_eval.py ├── local_serve │ ├── __init__.py │ ├── 
examples │ │ ├── Rebecca_(1939_poster)_Small.jpeg │ │ └── extreme_ironing.jpg │ ├── local_web_server.py │ └── model_worker.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── builder.py │ ├── configuration_mplug_owl2.py │ ├── configuration_qwen.py │ ├── convert_mplug_owl2_weight_to_hf.py │ ├── modeling_attn_mask_utils.py │ ├── modeling_llama2.py │ ├── modeling_mplug_owl2.py │ ├── modeling_qwen.py │ ├── multiway.py │ ├── utils.py │ └── visual_encoder.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── Rebecca_(1939_poster)_Small.jpeg │ │ └── extreme_ironing.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ └── register_workers.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── mplug_owl2_trainer.py │ ├── train.py │ └── train_mem.py └── utils.py ├── mscoco_pipeline.py ├── qwenvl └── run_qwenvl.py ├── requirements.txt ├── utils ├── FlagEmbedding │ ├── __init__.py │ └── visual │ │ ├── README.md │ │ ├── __init__.py │ │ ├── eva_clip │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── constants.py │ │ ├── eva_vit_model.py │ │ ├── factory.py │ │ ├── hf_configs.py │ │ ├── hf_model.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ └── EVA02-CLIP-bigE-14.json │ │ ├── modified_resnet.py │ │ ├── openai.py │ │ ├── pretrained.py │ │ ├── rope.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── transformer.py │ │ └── utils.py │ │ └── modeling.py ├── __init__.py ├── indexing_faiss.py ├── metrics.py ├── model_series.py └── utils.py ├── vcd_utils ├── vcd_add_noise.py └── vcd_sample.py ├── webqa_oracle.py └── webqa_pipeline.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | *.pyc 4 | *.egg-info 5 | dist 6 | 7 | # Log 8 | *.log 9 | *.log.* 10 | 11 | # Data 12 | *.tar.gz 13 | # Other 14 | .DS_Store 15 | wandb 16 | output 17 | checkpoints 18 | datasets 19 | finetune/tasks 20 | ckpts* 21 | 22 | .ipynb_checkpoints 23 | *.ipynb 24 | 25 | visualize.py 26 | attention_visualization 27 | *.pth 28 | logs 29 | test_chatgpt.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 IDEA-FinAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RagVL
2 | This is the official repo for the paper: ["MLLM Is a Strong Reranker: Advancing Multimodal Retrieval-augmented Generation via Knowledge-enhanced Reranking and Noise-injected Training"](https://arxiv.org/pdf/2407.21439).
3 |
4 | ![image](https://github.com/IDEA-FinAI/RagVL/blob/main/assets/framework.png)
5 |
6 | ## Updates
7 | - [2024-09-20]: To better reflect the generality of our proposed method, we have renamed it to **RagVL**.
8 | - [2024-08-05]: Code of RagVL (RagLLaVA) released.
9 | - [2024-07-31]: Paper of RagVL (RagLLaVA) available online.
10 |
11 | ## Getting Started
12 | ### Environment Setup
13 | The required libraries for running RagVL can be found in `requirements.txt`. We recommend following [LLaVA](https://github.com/haotian-liu/LLaVA) to configure your environment.
14 |
15 | ### Data Preparation
16 | Before running RagVL, please:
17 |
18 | 1. Download the **datasets** and **checkpoints** from [Google Drive](https://drive.google.com/drive/folders/1wY18Vbrb8yDbFSg1Te-FQIs84AYYh48Z?usp=drive_link).
19 |
20 | 2. Download the **image files** from [WebQA](https://github.com/WebQnA/WebQA) and [MultimodalQA](https://github.com/allenai/multimodalqa).
21 |
22 | 3. Unzip the downloaded files. Place the `checkpoints/` and `datasets/` into `RagVL/`.
23 |
24 | 4. Place the `tasks/` into `RagVL/finetune/`.
25 |
26 | 5. Place the `MMQA_imgs/` and `train_img/` into `RagVL/finetune/tasks/`.
27 |
28 | 6. Place the `val_image/` into `RagVL/datasets/`.
29 |
30 | ## Training
31 | 1. Reranker
32 |
33 | | Models | Global Batch Size | Epochs |
34 | | --- | ---: | ---: |
35 | | LLaVA-v1.5-13B | 16 | 2 (WebQA) / 1 (others) |
36 | | Qwen-VL-Chat | 16 | 2 (WebQA) / 1 (others) |
37 | | mPLUG-Owl2 | 16 | 2 (WebQA) / 1 (others) |
38 | | InternVL2-1B | 16 | 1 |
39 | | InternVL2-2B | 16 | 1 |
40 |
41 | 2. Generator
42 |
43 | | Models | Global Batch Size | Epochs |
44 | | --- | ---: | ---: |
45 | | LLaVA-v1.5-13B | 16 | 2 (WebQA) / 3 (MMQA) |
46 | | InternVL2-1B | 16 | 1 |
47 | | InternVL2-2B | 16 | 1 |
48 |
49 | Apart from these two hyperparameters, all other settings follow each model's defaults.
50 |
51 | To finetune LLaVA-v1.5-13B, Qwen-VL-Chat, and mPLUG-Owl2, find the corresponding finetune script in `RagVL/finetune/scripts/`.
52 |
53 | To finetune InternVL2-1B and InternVL2-2B, find the corresponding finetune script in `RagVL/internvl_chat/shell/internvl2.0/2nd_finetune`.
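For example, assuming the data and checkpoints have been placed as described above, a launch from the `RagVL/` repository root might look like the sketch below. These exact invocations are illustrative rather than prescribed by the scripts' authors; the GPU lists, data paths, and output directories are hard-coded inside each script and will likely need to be adjusted for your machine.

```bash
# LLaVA-v1.5-13B LoRA reranker finetuning (as configured, on the Flickr30k rerank data).
# Run from the script's own directory so its relative ../../ paths resolve.
(cd finetune/scripts/llava && bash finetune_task_lora.sh)

# InternVL2-2B LoRA 2nd finetuning.
# Run from internvl_chat/ so the script's PYTHONPATH export and ./shell/... paths resolve.
(cd internvl_chat && bash shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh)
```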
54 |
55 | ## Evaluation
56 | To evaluate RagVL on WebQA / MultimodalQA, use the following command:
57 |
58 | ```
59 | python webqa_pipeline.py \ # same arguments apply to mmqa_pipeline.py
60 | --reranker_model caption_lora \ # select the reranker
61 | --generator_model noise_injected_lora \ # select the generator
62 | --filter 0 \ # select the adaptive threshold
63 | --clip_topk 20 # we first retrieve 20 candidates by default
64 | ```
65 |
66 | To evaluate the oracle settings on WebQA / MultimodalQA, use the following command:
67 |
68 | ```
69 | python webqa_oracle.py # same arguments apply to mmqa_oracle.py
70 | ```
71 |
72 | ## Citation
73 | If you find this work interesting or inspiring, please cite us:
74 | ```bibtex
75 | @article{chen2024mllm,
76 | title={MLLM Is a Strong Reranker: Advancing Multimodal Retrieval-augmented Generation via Knowledge-enhanced Reranking and Noise-injected Training},
77 | author={Chen, Zhanpeng and Xu, Chengjin and Qi, Yiyan and Guo, Jian},
78 | journal={arXiv preprint arXiv:2407.21439},
79 | year={2024}
80 | }
81 | ```
82 |
83 | ## Related Projects
84 | - [LLaVA](https://github.com/haotian-liu/LLaVA): Large Language and Vision Assistant
85 | - [Qwen-VL](https://github.com/QwenLM/Qwen-VL): A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond
86 | - [mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl): The Powerful Multi-modal Large Language Model Family
87 | - [InternVL](https://github.com/OpenGVLab/InternVL): A Pioneering Open-Source Alternative to GPT-4o
88 | - [Visualized BGE](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/visual): A universal multi-modal embedding model
89 | - [VCD](https://github.com/DAMO-NLP-SG/VCD): Mitigating Object Hallucinations in Large Vision-Language Models through Visual Contrastive Decoding
90 | - [CAL](https://github.com/foundation-multimodal-models/CAL): Prioritizing Visual Correlation by Contrastive Alignment
91 |
92 |
93 |
--------------------------------------------------------------------------------
/assets/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/assets/framework.png
--------------------------------------------------------------------------------
/finetune/scripts/llava/finetune_task_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed --include localhost:4,5,6,7 ../../../llava/train/train_mem.py \
4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
5 | --deepspeed ./zero3.json \
6 | --model_name_or_path liuhaotian/llava-v1.5-13b \
7 | --version v1 \
8 | --data_path ../../tasks/Flickr30k_one_train_rerank.json \
9 | --image_folder ../../tasks \
10 | --vision_tower openai/clip-vit-large-patch14-336 \
11 | --mm_projector_type mlp2x_gelu \
12 | --mm_vision_select_layer -2 \
13 | --mm_use_im_start_end False \
14 | --mm_use_im_patch_token False \
15 | --image_aspect_ratio pad \
16 | --group_by_modality_length True \
17 | --bf16 True \
18 | --output_dir ../../../checkpoints/llava-v1.5-13b-2epoch-16batch_size-flickr30k-one-reranker-caption-lora \
19 | --num_train_epochs 2 \
20 | --per_device_train_batch_size 16 \
21 | --per_device_eval_batch_size 4 \
22 | --gradient_accumulation_steps 1 \
23 | --evaluation_strategy "no" \
24 | --save_strategy "steps" \
25 | --save_steps 50000 \
26 | --save_total_limit 1 \
27 |
--learning_rate 2e-4 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4\ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | 39 | -------------------------------------------------------------------------------- /finetune/scripts/llava/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /finetune/scripts/llava/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /finetune/scripts/llava/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 
| } -------------------------------------------------------------------------------- /finetune/scripts/mplug_owl2/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LOAD='MAGAer13/mplug-owl2-llama2-7b' 4 | 5 | DATA_FILE=../../tasks/WebQA_train_QA_owl.json 6 | deepspeed --include localhost:4,5,6,7 ../../../mplug_owl2/train/train_mem.py \ 7 | --lora_enable True --lora_r 128 --lora_alpha 256 --visual_abstractor_lr 2e-5 \ 8 | --deepspeed ./zero3.json \ 9 | --model_name_or_path $LOAD \ 10 | --version v1 \ 11 | --data_path $DATA_FILE \ 12 | --image_folder ../../tasks \ 13 | --image_aspect_ratio pad \ 14 | --group_by_modality_length True \ 15 | --bf16 True \ 16 | --output_dir ../../../checkpoints/mplug-owl2-2epoch-8batch_size-webqa-noise-injected-lora \ 17 | --num_train_epochs 2 \ 18 | --per_device_train_batch_size 8 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 10 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-4 \ 26 | --weight_decay 0. \ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --tune_visual_abstractor True \ 34 | --freeze_vision_model True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb -------------------------------------------------------------------------------- /finetune/scripts/mplug_owl2/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /finetune/scripts/mplug_owl2/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_param_persistence_threshold": "auto", 23 | "stage3_max_live_parameters": 0, 24 | "stage3_max_reuse_distance": 0, 25 | "stage3_prefetch_bucket_size": 0, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /finetune/scripts/mplug_owl2/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 
10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "none", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "stage3_gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": 
"auto", 44 | "steps_per_print": 100, 45 | "train_batch_size": "auto", 46 | "train_micro_batch_size_per_gpu": "auto", 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/finetune_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 14 | DATA="path_to_data" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --data_path $DATA \ 27 | --bf16 True \ 28 | --fix_vit True \ 29 | --output_dir output_qwen \ 30 | --num_train_epochs 5 \ 31 | --per_device_train_batch_size 1 \ 32 | --per_device_eval_batch_size 1 \ 33 | --gradient_accumulation_steps 16 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 1000 \ 37 | --save_total_limit 10 \ 38 | --learning_rate 1e-5 \ 39 | --weight_decay 0.1 \ 40 | --adam_beta2 0.95 \ 41 | --warmup_ratio 0.01 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --report_to "none" \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --deepspeed finetune/ds_config_zero3.json 49 | -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/finetune_lora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=7 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="../../tasks/WebQA_train_QA_qwenvl.json" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | 24 | export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 25 | 26 | torchrun $DISTRIBUTED_ARGS finetune.py \ 27 | --model_name_or_path $MODEL \ 28 | --data_path $DATA \ 29 | --bf16 True \ 30 | --fix_vit True \ 31 | --output_dir ../../../checkpoints/qwen-vl-chat-2epoch-2batch_size-webqa-noise-injected-lora-new \ 32 | --num_train_epochs 2 \ 33 | --per_device_train_batch_size 2 \ 34 | --per_device_eval_batch_size 1 \ 35 | --gradient_accumulation_steps 8 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 1000 \ 39 | --save_total_limit 10 \ 40 | --learning_rate 1e-5 \ 41 | --weight_decay 0.1 \ 42 | --adam_beta2 0.95 \ 43 | --warmup_ratio 0.01 \ 44 | --lr_scheduler_type "cosine" \ 45 | --logging_steps 1 \ 46 | --report_to "none" \ 47 | --model_max_length 2048 \ 48 | --lazy_preprocess True \ 49 | --use_lora \ 50 | --gradient_checkpointing \ 51 | --deepspeed ds_config_zero2.json -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/finetune_lora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | 6 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly 7 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 8 | # See the section for finetuning in README for more information. 9 | DATA="path_to_data" 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | python finetune.py \ 14 | --model_name_or_path $MODEL \ 15 | --data_path $DATA \ 16 | --bf16 True \ 17 | --fix_vit True \ 18 | --output_dir output_qwen \ 19 | --num_train_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --gradient_accumulation_steps 8 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 1000 \ 26 | --save_total_limit 10 \ 27 | --learning_rate 1e-5 \ 28 | --weight_decay 0.1 \ 29 | --adam_beta2 0.95 \ 30 | --warmup_ratio 0.01 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --report_to "none" \ 34 | --model_max_length 2048 \ 35 | --lazy_preprocess True \ 36 | --gradient_checkpointing \ 37 | --use_lora -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/finetune_qlora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="path_to_data" 15 | 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | # Remember to use --fp16 instead of --bf16 due to autogptq 26 | torchrun $DISTRIBUTED_ARGS finetune.py \ 27 | --model_name_or_path $MODEL \ 28 | --data_path $DATA \ 29 | --fp16 True \ 30 | --fix_vit True \ 31 | --output_dir output_qwen \ 32 | --num_train_epochs 5 \ 33 | --per_device_train_batch_size 2 \ 34 | --per_device_eval_batch_size 1 \ 35 | --gradient_accumulation_steps 8 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 1000 \ 39 | --save_total_limit 10 \ 40 | --learning_rate 1e-5 \ 41 | --weight_decay 0.1 \ 42 | --adam_beta2 0.95 \ 43 | --warmup_ratio 0.01 \ 44 | --lr_scheduler_type "cosine" \ 45 | --logging_steps 1 \ 46 | --report_to "none" \ 47 | --model_max_length 2048 \ 48 | --lazy_preprocess True \ 49 | --use_lora \ 50 | --q_lora \ 51 | --gradient_checkpointing \ 52 | --deepspeed finetune/ds_config_zero2.json -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/finetune_qlora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly 6 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 7 | # See the section for finetuning in README for more information. 8 | DATA="path_to_data" 9 | 10 | export CUDA_VISIBLE_DEVICES=0 11 | 12 | # Remember to use --fp16 instead of --bf16 due to autogptq 13 | python finetune.py \ 14 | --model_name_or_path $MODEL \ 15 | --data_path $DATA \ 16 | --fp16 True \ 17 | --fix_vit True \ 18 | --output_dir output_qwen \ 19 | --num_train_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --gradient_accumulation_steps 8 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 1000 \ 26 | --save_total_limit 10 \ 27 | --learning_rate 1e-5 \ 28 | --weight_decay 0.1 \ 29 | --adam_beta2 0.95 \ 30 | --warmup_ratio 0.01 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --report_to "none" \ 34 | --model_max_length 2048 \ 35 | --lazy_preprocess True \ 36 | --gradient_checkpointing \ 37 | --use_lora \ 38 | --q_lora \ 39 | --deepspeed finetune/ds_config_zero2.json 40 | -------------------------------------------------------------------------------- /internvl_chat/README.md: -------------------------------------------------------------------------------- 1 | # InternVL-Chat 2 | 3 | This folder contains the implementation of the InternVL-Chat. 4 | 5 | ## 🛠️ Installation 6 | 7 | See [INSTALLATION.md](../INSTALLATION.md) 8 | 9 | In addition, using this codebase requires executing the following steps: 10 | 11 | - Install other requirements: 12 | 13 | ```bash 14 | pip install --upgrade pip # enable PEP 660 support 15 | pip install -e . 
16 | ``` 17 | 18 | ## 📖 Documents 19 | 20 | - InternVL 2.0 21 | 22 | - Introduction [\[link\]](https://internvl.readthedocs.io/en/latest/internvl2.0/introduction.html) 23 | - Quick Start [\[link\]](https://internvl.readthedocs.io/en/latest/internvl2.0/quick_start.html) 24 | - Finetune [\[link\]](https://internvl.readthedocs.io/en/latest/internvl2.0/finetune.html) 25 | - Evaluation [\[link\]](https://internvl.readthedocs.io/en/latest/internvl2.0/evaluation.html) 26 | - Deployment [\[link\]](https://internvl.readthedocs.io/en/latest/internvl2.0/deployment.html) 27 | 28 | - InternVL 1.5 29 | 30 | - Introduction [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.5/introduction.html) 31 | - Quick Start [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.5/quick_start.html) 32 | - Finetune [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.5/finetune.html) 33 | - Evaluation [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.5/evaluation.html) 34 | - Deployment [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.5/deployment.html) 35 | 36 | - InternVL 1.2 37 | 38 | - Introduction [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.2/introduction.html) 39 | - Quick Start [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.2/quick_start.html) 40 | - Reproduce [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.2/reproduce.html) 41 | - Finetune [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.2/finetune.html) 42 | - Evaluation [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.2/evaluation.html) 43 | 44 | - InternVL 1.1 45 | 46 | - Introduction [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.1/introduction.html) 47 | - Quick Start [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.1/quick_start.html) 48 | - Evaluation [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.1/evaluation.html) 49 | -------------------------------------------------------------------------------- /internvl_chat/internvl/dist_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | import subprocess 4 | from datetime import timedelta 5 | 6 | import deepspeed 7 | import torch 8 | import torch.multiprocessing as mp 9 | from torch import distributed as dist 10 | 11 | timeout = timedelta(minutes=60) 12 | 13 | 14 | def _find_free_port(): 15 | # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501 16 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 17 | # Binding to port 0 will cause the OS to find an available port for us 18 | sock.bind(('', 0)) 19 | port = sock.getsockname()[1] 20 | sock.close() 21 | # NOTE: there is still a chance the port could be taken by other processes. 
22 | return port 23 | 24 | 25 | def _is_free_port(port): 26 | ips = socket.gethostbyname_ex(socket.gethostname())[-1] 27 | ips.append('localhost') 28 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 29 | return all(s.connect_ex((ip, port)) != 0 for ip in ips) 30 | 31 | 32 | def init_dist(launcher, backend='nccl', **kwargs): 33 | if mp.get_start_method(allow_none=True) is None: 34 | mp.set_start_method('spawn') 35 | if launcher == 'pytorch': 36 | _init_dist_pytorch(backend, **kwargs) 37 | elif launcher == 'mpi': 38 | _init_dist_mpi(backend, **kwargs) 39 | elif launcher == 'slurm': 40 | _init_dist_slurm(backend, **kwargs) 41 | else: 42 | raise ValueError(f'Invalid launcher type: {launcher}') 43 | 44 | 45 | def _init_dist_pytorch(backend, **kwargs): 46 | # TODO: use local_rank instead of rank % num_gpus 47 | rank = int(os.environ['RANK']) 48 | num_gpus = torch.cuda.device_count() 49 | torch.cuda.set_device(rank % num_gpus) 50 | # dist.init_process_group(backend=backend, **kwargs) 51 | deepspeed.init_distributed(dist_backend=backend) 52 | 53 | 54 | def _init_dist_mpi(backend, **kwargs): 55 | local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) 56 | torch.cuda.set_device(local_rank) 57 | if 'MASTER_PORT' not in os.environ: 58 | # 29500 is torch.distributed default port 59 | os.environ['MASTER_PORT'] = '29500' 60 | if 'MASTER_ADDR' not in os.environ: 61 | raise KeyError('The environment variable MASTER_ADDR is not set') 62 | os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] 63 | os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] 64 | dist.init_process_group(backend=backend, **kwargs) 65 | 66 | 67 | def _init_dist_slurm(backend, port=None): 68 | """Initialize slurm distributed training environment. 69 | 70 | If argument ``port`` is not specified, then the master port will be system 71 | environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system 72 | environment variable, then a default port ``29500`` will be used. 73 | 74 | Args: 75 | backend (str): Backend of torch.distributed. 76 | port (int, optional): Master port. Defaults to None. 
77 | """ 78 | proc_id = int(os.environ['SLURM_PROCID']) 79 | ntasks = int(os.environ['SLURM_NTASKS']) 80 | node_list = os.environ['SLURM_NODELIST'] 81 | num_gpus = torch.cuda.device_count() 82 | torch.cuda.set_device(proc_id % num_gpus) 83 | addr = subprocess.getoutput( 84 | f'scontrol show hostname {node_list} | head -n1') 85 | # specify master port 86 | if port is not None: 87 | os.environ['MASTER_PORT'] = str(port) 88 | elif 'MASTER_PORT' in os.environ: 89 | pass # use MASTER_PORT in the environment variable 90 | else: 91 | # if torch.distributed default port(29500) is available 92 | # then use it, else find a free port 93 | if _is_free_port(29500): 94 | os.environ['MASTER_PORT'] = '29500' 95 | else: 96 | os.environ['MASTER_PORT'] = str(_find_free_port()) 97 | # use MASTER_ADDR in the environment variable if it already exists 98 | if 'MASTER_ADDR' not in os.environ: 99 | os.environ['MASTER_ADDR'] = addr 100 | os.environ['WORLD_SIZE'] = str(ntasks) 101 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 102 | os.environ['RANK'] = str(proc_id) 103 | # dist.init_process_group(backend=backend, timeout=timeout) 104 | deepspeed.init_distributed(dist_backend=backend) 105 | -------------------------------------------------------------------------------- /internvl_chat/internvl/model/__init__.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from internvl.model.internvl_chat import InternVLChatConfig, InternVLChatModel 5 | from transformers import AutoTokenizer 6 | 7 | 8 | def split_model(num_layers, vit_alpha=0.5): 9 | device_map = {} 10 | world_size = torch.cuda.device_count() 11 | # Since the first GPU will be used for ViT, treat it as half a GPU. 12 | num_layers_per_gpu = math.ceil(num_layers / (world_size - vit_alpha)) 13 | num_layers_per_gpu = [num_layers_per_gpu] * world_size 14 | num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * (1 - vit_alpha)) 15 | layer_cnt = 0 16 | for i, num_layer in enumerate(num_layers_per_gpu): 17 | for j in range(num_layer): 18 | device_map[f"language_model.model.layers.{layer_cnt}"] = i 19 | layer_cnt += 1 20 | device_map["vision_model"] = 0 21 | device_map["mlp1"] = 0 22 | device_map["language_model.model.tok_embeddings"] = 0 23 | device_map["language_model.model.embed_tokens"] = 0 24 | device_map["language_model.output"] = 0 25 | device_map["language_model.model.norm"] = 0 26 | device_map["language_model.lm_head"] = 0 27 | device_map[f"language_model.model.layers.{num_layers - 1}"] = 0 28 | 29 | return device_map 30 | 31 | 32 | def load_model_and_tokenizer(args): 33 | if args.auto: 34 | config = InternVLChatConfig.from_pretrained(args.checkpoint) 35 | num_hidden_layers = config.llm_config.num_hidden_layers 36 | device_map = split_model(num_hidden_layers) 37 | kwargs = {"device_map": device_map} if args.auto else {} 38 | tokenizer = AutoTokenizer.from_pretrained( 39 | args.checkpoint, trust_remote_code=True, use_fast=False 40 | ) 41 | model = InternVLChatModel.from_pretrained( 42 | args.checkpoint, 43 | low_cpu_mem_usage=True, 44 | torch_dtype=torch.bfloat16, 45 | load_in_8bit=args.load_in_8bit, 46 | load_in_4bit=args.load_in_4bit, 47 | **kwargs, 48 | ).eval() 49 | if not args.load_in_8bit and not args.load_in_4bit and not args.auto: 50 | model = model.cuda() 51 | return model, tokenizer 52 | -------------------------------------------------------------------------------- /internvl_chat/internvl/model/internvl_chat/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .configuration_intern_vit import InternVisionConfig 8 | from .configuration_internvl_chat import InternVLChatConfig 9 | from .modeling_intern_vit import InternVisionModel 10 | from .modeling_internvl_chat import InternVLChatModel 11 | 12 | __all__ = ['InternVisionConfig', 'InternVisionModel', 13 | 'InternVLChatConfig', 'InternVLChatModel'] 14 | -------------------------------------------------------------------------------- /internvl_chat/internvl/model/internvl_chat/flash_attention.py: -------------------------------------------------------------------------------- 1 | # https://github.com/Dao-AILab/flash-attention/blob/v0.2.8/flash_attn/flash_attention.py 2 | import torch 3 | import torch.nn as nn 4 | from einops import rearrange 5 | 6 | try: # v1 7 | from flash_attn.flash_attn_interface import \ 8 | flash_attn_unpadded_qkvpacked_func 9 | except: # v2 10 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func 11 | 12 | from flash_attn.bert_padding import pad_input, unpad_input 13 | 14 | 15 | class FlashAttention(nn.Module): 16 | """Implement the scaled dot product attention with softmax. 17 | Arguments 18 | --------- 19 | softmax_scale: The temperature to use for the softmax attention. 20 | (default: 1/sqrt(d_keys) where d_keys is computed at 21 | runtime) 22 | attention_dropout: The dropout rate to apply to the attention 23 | (default: 0.0) 24 | """ 25 | 26 | def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None): 27 | super().__init__() 28 | self.softmax_scale = softmax_scale 29 | self.dropout_p = attention_dropout 30 | 31 | def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None, 32 | max_s=None, need_weights=False): 33 | """Implements the multihead softmax attention. 34 | Arguments 35 | --------- 36 | qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None 37 | if unpadded: (nnz, 3, h, d) 38 | key_padding_mask: a bool tensor of shape (B, S) 39 | """ 40 | assert not need_weights 41 | assert qkv.dtype in [torch.float16, torch.bfloat16] 42 | assert qkv.is_cuda 43 | 44 | if cu_seqlens is None: 45 | batch_size = qkv.shape[0] 46 | seqlen = qkv.shape[1] 47 | if key_padding_mask is None: 48 | qkv = rearrange(qkv, 'b s ... -> (b s) ...') 49 | max_s = seqlen 50 | cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, 51 | device=qkv.device) 52 | output = flash_attn_unpadded_qkvpacked_func( 53 | qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, 54 | softmax_scale=self.softmax_scale, causal=causal 55 | ) 56 | output = rearrange(output, '(b s) ... 
-> b s ...', b=batch_size) 57 | else: 58 | nheads = qkv.shape[-2] 59 | x = rearrange(qkv, 'b s three h d -> b s (three h d)') 60 | x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask) 61 | x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads) 62 | output_unpad = flash_attn_unpadded_qkvpacked_func( 63 | x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, 64 | softmax_scale=self.softmax_scale, causal=causal 65 | ) 66 | output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), 67 | indices, batch_size, seqlen), 68 | 'b s (h d) -> b s h d', h=nheads) 69 | else: 70 | assert max_s is not None 71 | output = flash_attn_unpadded_qkvpacked_func( 72 | qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, 73 | softmax_scale=self.softmax_scale, causal=causal 74 | ) 75 | 76 | return output, None 77 | -------------------------------------------------------------------------------- /internvl_chat/internvl/patch/__init__.py: -------------------------------------------------------------------------------- 1 | from .llama2_flash_attn_monkey_patch import replace_llama2_attn_with_flash_attn 2 | from .llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 3 | from .llama_rmsnorm_monkey_patch import \ 4 | replace_llama_rmsnorm_with_fused_rmsnorm 5 | from .pad_data_collator import concat_pad_data_collator, pad_data_collator 6 | from .train_sampler_patch import replace_train_sampler 7 | 8 | __all__ = ['replace_llama_attn_with_flash_attn', 9 | 'replace_llama_rmsnorm_with_fused_rmsnorm', 10 | 'replace_llama2_attn_with_flash_attn', 11 | 'replace_train_sampler', 12 | 'pad_data_collator', 13 | 'concat_pad_data_collator'] 14 | -------------------------------------------------------------------------------- /internvl_chat/internvl/patch/llama_rmsnorm_monkey_patch.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | 4 | def replace_llama_rmsnorm_with_fused_rmsnorm(): 5 | try: 6 | from functools import partial 7 | 8 | from apex.normalization import FusedRMSNorm 9 | LlamaRMSNorm = partial(FusedRMSNorm, eps=1e-6) # noqa 10 | transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm 11 | print('Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm') 12 | except ImportError: 13 | # using the normal LlamaRMSNorm 14 | pass 15 | except Exception: 16 | print('discovered apex but it failed to load, falling back to LlamaRMSNorm') 17 | pass 18 | -------------------------------------------------------------------------------- /internvl_chat/internvl/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/internvl_chat/internvl/train/__init__.py -------------------------------------------------------------------------------- /internvl_chat/internvl/train/constants.py: -------------------------------------------------------------------------------- 1 | IMG_CONTEXT_TOKEN = '' 2 | IMG_START_TOKEN = '' 3 | IMG_END_TOKEN = '' 4 | QUAD_START_TOKEN = '' 5 | QUAD_END_TOKEN = '' 6 | REF_START_TOKEN = '' 7 | REF_END_TOKEN = '' 8 | BOX_START_TOKEN = '' 9 | BOX_END_TOKEN = '' 10 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 11 | IMAGENET_STD = (0.229, 0.224, 0.225) 12 | CLIP_MEAN = (0.4814546, 0.4578275, 0.40821073) 13 | CLIP_STD = (0.2686295, 0.2613025, 0.2757711) 14 | SIGLIP_MEAN = (0.5, 0.5, 0.5) 15 | 
SIGLIP_STD = (0.5, 0.5, 0.5) 16 | -------------------------------------------------------------------------------- /internvl_chat/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "internvl_chat" 7 | version = "2.0.0" 8 | description = "Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | ] 17 | 18 | [project.urls] 19 | "Homepage" = "https://github.com/OpenGVLab/InternVL" 20 | "Bug Tracker" = "https://github.com/OpenGVLab/InternVL/issues" 21 | 22 | [tool.setuptools.packages.find] 23 | exclude = ["data*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "shell*"] 24 | 25 | [tool.wheel] 26 | exclude = ["data*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "shell*"] 27 | -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_flickr30k_rerank.json: -------------------------------------------------------------------------------- 1 | { 2 | "webqa_rerank": { 3 | "root": "RagVL/finetune/tasks", 4 | "annotation": "RagVL/finetune/tasks/Flickr30k_one_train_rerank_clip_negatives_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 58000 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_mmqa_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "webqa_qa": { 3 | "root": "RagVL/finetune/tasks", 4 | "annotation": "RagVL/finetune/tasks/MMQA_train_QA_single_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 2099 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_mmqa_rerank.json: -------------------------------------------------------------------------------- 1 | { 2 | "webqa_rerank": { 3 | "root": "RagVL/finetune/tasks", 4 | "annotation": "RagVL/finetune/tasks/MMQA_train_rerank_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 19432 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_mscoco_rerank.json: -------------------------------------------------------------------------------- 1 | { 2 | "webqa_rerank": { 3 | "root": "RagVL/finetune/tasks", 4 | "annotation": "RagVL/finetune/tasks/MSCOCO_one_train_rerank_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 40000 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_webqa_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "webqa_qa": { 3 | "root": "finetune/tasks", 4 | "annotation": "finetune/tasks/WebQA_train_QA_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 15163 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_webqa_rerank.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "webqa_rerank": { 3 | "root": "RagVL/finetune/tasks", 4 | "annotation": "RagVL/finetune/tasks/WebQA_train_rerank_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 32990 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-16} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-128} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_2/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full' 20 | 21 | if [ ! -d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 16 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 2 28 | # total batch size: 128 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./pretrained/InternVL-Chat-V1-2-Plus" \ 41 | --conv_style "Hermes-2" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 1 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.0 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone True \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 1 \ 62 | --learning_rate 1e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 2048 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size False \ 72 | --use_thumbnail False \ 73 | --ps_version 'v1' \ 74 | --deepspeed "zero_stage3_config_34b.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | 
export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_2/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_lora' 15 | 16 | if [ ! -d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL-Chat-V1-2-Plus" \ 33 | --conv_style "Hermes-2" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 1 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 1e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 2048 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size False \ 65 | --use_thumbnail False \ 66 | --ps_version 'v1' \ 67 | --deepspeed "zero_stage3_config_34b.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.2/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-64} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-512} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-8} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_2/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 64 26 | # batch size per gpu: 8 27 | # gradient accumulation steps: 1 28 | # total batch size: 512 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --vision_path "./pretrained/InternViT-6B-448px-V1-2" \ 41 | --mlp_path "./pretrained/InternViT-6B-448px-V1-2/mlp_projector/hermes_2_yi_34b.pth" \ 42 | --llm_path "./pretrained/Nous-Hermes-2-Yi-34B" \ 43 | --conv_style "Hermes-2" \ 44 | --output_dir ${OUTPUT_DIR} \ 45 | --meta_path "./shell/data/internvl_1_2_finetune.json" \ 46 | --overwrite_output_dir True \ 47 | --force_image_size 448 \ 48 | --down_sample_ratio 0.5 \ 49 | --drop_path_rate 0.4 \ 50 | --freeze_llm False \ 51 | --freeze_mlp False \ 52 | --freeze_backbone False \ 53 | --vision_select_layer -1 \ 54 | --dataloader_num_workers 4 \ 55 | --bf16 True \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 58 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 59 | --evaluation_strategy "no" \ 60 | --save_strategy "steps" \ 61 | --save_steps 200 \ 62 | --save_total_limit 3 \ 63 | --learning_rate 1e-5 \ 64 | --weight_decay 0.05 \ 65 | --warmup_ratio 0.03 \ 66 | --lr_scheduler_type "cosine" \ 67 | --logging_steps 1 \ 68 | --max_seq_length 2048 \ 69 | --do_train True \ 70 | --grad_checkpoint True \ 71 | --group_by_length True \ 72 | --deepspeed "zero_stage3_config_34b.json" \ 73 | --report_to "tensorboard" \ 74 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 75 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/Mini-InternVL-Chat-2B-V1-5" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.01 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/Mini-InternVL-Chat-2B-V1-5" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 4e-5 \ 56 | --weight_decay 0.01 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage1_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL-Chat-V1-5" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.4 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 2e-5 \ 55 | --weight_decay 0.05 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage3_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL-Chat-V1-5" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 2e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage3_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/Mini-InternVL-Chat-4B-V1-5" \ 33 | --conv_style "phi3-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.05 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/Mini-InternVL-Chat-4B-V1-5" \ 33 | --conv_style "phi3-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 4e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage1_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-256} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-1024} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 256 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 1 28 | # total batch size: 1024 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./work_dirs/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain" \ 41 | --conv_style "Hermes-2" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "path/to/finetune/data.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 12 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.4 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone False \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 3 \ 62 | --learning_rate 2e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 4096 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage3_config_34b.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-256} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-2048} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 256 26 | # batch size per gpu: 2 27 | # gradient accumulation steps: 4 28 | # total batch size: 2048 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_pretrain.py \ 40 | --vision_path "./pretrained/InternViT-6B-448px-V1-5" \ 41 | --mlp_path "./pretrained/InternViT-6B-448px-V1-2/mlp_projector/hermes_2_yi_34b.pth" \ 42 | --llm_path "./pretrained/Nous-Hermes-2-Yi-34B" \ 43 | --conv_style "Hermes-2" \ 44 | --output_dir ${OUTPUT_DIR} \ 45 | --meta_path "path/to/pretrain/data.json" \ 46 | --overwrite_output_dir True \ 47 | --force_image_size 448 \ 48 | --max_dynamic_patch 12 \ 49 | --down_sample_ratio 0.5 \ 50 | --drop_path_rate 0.0 \ 51 | --freeze_llm True \ 52 | --freeze_mlp False \ 53 | --freeze_backbone True \ 54 | --vision_select_layer -1 \ 55 | --dataloader_num_workers 4 \ 56 | --bf16 True \ 57 | --num_train_epochs 1 \ 58 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 59 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 60 | --evaluation_strategy "no" \ 61 | --save_strategy "steps" \ 62 | --save_steps 200 \ 63 | --save_total_limit 3 \ 64 | --learning_rate 1e-4 \ 65 | --weight_decay 0.05 \ 66 | --warmup_steps 100 \ 67 | --lr_scheduler_type "cosine" \ 68 | --logging_steps 1 \ 69 | --max_seq_length 4096 \ 70 | --do_train True \ 71 | --grad_checkpoint True \ 72 | --group_by_length False \ 73 | --dynamic_image_size True \ 74 | --use_thumbnail True \ 75 | --ps_version 'v2' \ 76 | --deepspeed "zero_stage3_config_34b.json" \ 77 | --report_to "tensorboard" \ 78 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 79 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-128} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-1024} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 128 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 2 28 | # total batch size: 1024 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./work_dirs/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain" \ 41 | --conv_style "internlm2-chat" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "path/to/finetune/data.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 12 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.1 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone False \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 3 \ 62 | --learning_rate 4e-5 \ 63 | --weight_decay 0.01 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 8192 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage1_config.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-128} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-2048} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-8} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 128 26 | # batch size per gpu: 8 27 | # gradient accumulation steps: 2 28 | # total batch size: 2048 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_pretrain.py \ 40 | --vision_path "./pretrained/InternViT-300M-448px" \ 41 | --llm_path "./pretrained/internlm2-chat-1_8b" \ 42 | --conv_style "internlm2-chat" \ 43 | --output_dir ${OUTPUT_DIR} \ 44 | --meta_path "path/to/pretrain/data.json" \ 45 | --overwrite_output_dir True \ 46 | --force_image_size 448 \ 47 | --max_dynamic_patch 12 \ 48 | --down_sample_ratio 0.5 \ 49 | --drop_path_rate 0.1 \ 50 | --freeze_llm True \ 51 | --freeze_mlp False \ 52 | --freeze_backbone False \ 53 | --vision_select_layer -1 \ 54 | --dataloader_num_workers 4 \ 55 | --bf16 True \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 58 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 59 | --evaluation_strategy "no" \ 60 | --save_strategy "steps" \ 61 | --save_steps 200 \ 62 | --save_total_limit 3 \ 63 | --learning_rate 2e-5 \ 64 | --weight_decay 0.01 \ 65 | --warmup_steps 100 \ 66 | --lr_scheduler_type "cosine" \ 67 | --logging_steps 1 \ 68 | --max_seq_length 4096 \ 69 | --do_train True \ 70 | --grad_checkpoint True \ 71 | --group_by_length False \ 72 | --dynamic_image_size True \ 73 | --use_thumbnail True \ 74 | --ps_version 'v2' \ 75 | --deepspeed "zero_stage1_config.json" \ 76 | --report_to "tensorboard" \ 77 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 78 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/internlm2_20b/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-256} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-1024} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 256 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 1 28 | # total batch size: 1024 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./work_dirs/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain" \ 41 | --conv_style "internlm2-chat" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "path/to/finetune/data.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 12 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.4 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone False \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 3 \ 62 | --learning_rate 2e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 4096 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage3_config.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/internlm2_20b/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-256} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-2048} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 256 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 2 28 | # total batch size: 2048 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_pretrain.py \ 40 | --vision_path "./pretrained/InternViT-6B-448px-V1-5" \ 41 | --llm_path "./pretrained/internlm2-chat-20b" \ 42 | --conv_style "internlm2-chat" \ 43 | --output_dir ${OUTPUT_DIR} \ 44 | --meta_path "path/to/pretrain/data.json" \ 45 | --overwrite_output_dir True \ 46 | --force_image_size 448 \ 47 | --max_dynamic_patch 12 \ 48 | --down_sample_ratio 0.5 \ 49 | --drop_path_rate 0.2 \ 50 | --freeze_llm True \ 51 | --freeze_mlp False \ 52 | --freeze_backbone False \ 53 | --vision_select_layer -1 \ 54 | --dataloader_num_workers 4 \ 55 | --bf16 True \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 58 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 59 | --evaluation_strategy "no" \ 60 | --save_strategy "steps" \ 61 | --save_steps 200 \ 62 | --save_total_limit 3 \ 63 | --learning_rate 1e-5 \ 64 | --weight_decay 0.05 \ 65 | --warmup_steps 100 \ 66 | --lr_scheduler_type "cosine" \ 67 | --logging_steps 1 \ 68 | --max_seq_length 4096 \ 69 | --do_train True \ 70 | --grad_checkpoint True \ 71 | --group_by_length False \ 72 | --dynamic_image_size True \ 73 | --use_thumbnail True \ 74 | --ps_version 'v2' \ 75 | --deepspeed "zero_stage3_config.json" \ 76 | --report_to "tensorboard" \ 77 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 78 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/phi3_3_8b/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-128} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-1024} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 128 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 2 28 | # total batch size: 1024 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./work_dirs/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune" \ 41 | --conv_style "phi3-chat" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "path/to/finetune/data.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 12 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.1 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone False \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 3 \ 62 | --learning_rate 4e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 8192 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage1_config.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/phi3_3_8b/internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-128} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-2048} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-8} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 128 26 | # batch size per gpu: 8 27 | # gradient accumulation steps: 2 28 | # total batch size: 2048 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_pretrain.py \ 40 | --vision_path "./pretrained/InternViT-300M-448px" \ 41 | --llm_path "./pretrained/Phi-3-mini-128k-instruct" \ 42 | --conv_style "phi3-chat" \ 43 | --output_dir ${OUTPUT_DIR} \ 44 | --meta_path "path/to/pretrain/data.json" \ 45 | --overwrite_output_dir True \ 46 | --force_image_size 448 \ 47 | --max_dynamic_patch 12 \ 48 | --down_sample_ratio 0.5 \ 49 | --drop_path_rate 0.0 \ 50 | --freeze_llm True \ 51 | --freeze_mlp False \ 52 | --freeze_backbone True \ 53 | --vision_select_layer -1 \ 54 | --dataloader_num_workers 4 \ 55 | --bf16 True \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 58 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 59 | --evaluation_strategy "no" \ 60 | --save_strategy "steps" \ 61 | --save_steps 200 \ 62 | --save_total_limit 3 \ 63 | --learning_rate 2e-4 \ 64 | --weight_decay 0.05 \ 65 | --warmup_steps 100 \ 66 | --lr_scheduler_type "cosine" \ 67 | --logging_steps 1 \ 68 | --max_seq_length 4096 \ 69 | --do_train True \ 70 | --grad_checkpoint True \ 71 | --group_by_length False \ 72 | --dynamic_image_size True \ 73 | --use_thumbnail True \ 74 | --ps_version 'v2' \ 75 | --deepspeed "zero_stage1_config.json" \ 76 | --report_to "tensorboard" \ 77 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 78 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-1B" \ 33 | --conv_style "Hermes-2" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.01 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-4} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | export CUDA_VISIBLE_DEVICES=0,1,2,3 14 | 15 | OUTPUT_DIR='RagVL/checkpoints/internvl2_1b_1epoch-16batch_size-mmqa-noise-injected-lora' 16 | 17 | if [ ! 
-d "$OUTPUT_DIR" ]; then 18 | mkdir -p "$OUTPUT_DIR" 19 | fi 20 | 21 | # number of gpus: 4 22 | # batch size per gpu: 4 23 | # gradient accumulation steps: 1 24 | # total batch size: 16 25 | # epoch: 1 26 | torchrun \ 27 | --nnodes=1 \ 28 | --node_rank=0 \ 29 | --master_addr=127.0.0.1 \ 30 | --nproc_per_node=${GPUS} \ 31 | --master_port=${MASTER_PORT} \ 32 | RagVL/internvl_chat/internvl/train/internvl_chat_finetune.py \ 33 | --model_name_or_path "OpenGVLab/InternVL2-1B" \ 34 | --conv_style "Hermes-2" \ 35 | --output_dir ${OUTPUT_DIR} \ 36 | --meta_path "RagVL/internvl_chat/shell/data/internvl_2_finetune_mmqa_qa.json" \ 37 | --overwrite_output_dir True \ 38 | --force_image_size 448 \ 39 | --max_dynamic_patch 6 \ 40 | --down_sample_ratio 0.5 \ 41 | --drop_path_rate 0.0 \ 42 | --freeze_llm True \ 43 | --freeze_mlp True \ 44 | --freeze_backbone True \ 45 | --use_llm_lora 16 \ 46 | --vision_select_layer -1 \ 47 | --dataloader_num_workers 4 \ 48 | --bf16 True \ 49 | --num_train_epochs 1 \ 50 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 51 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 52 | --evaluation_strategy "no" \ 53 | --save_strategy "steps" \ 54 | --save_steps 200 \ 55 | --save_total_limit 1 \ 56 | --learning_rate 4e-5 \ 57 | --weight_decay 0.01 \ 58 | --warmup_ratio 0.03 \ 59 | --lr_scheduler_type "cosine" \ 60 | --logging_steps 1 \ 61 | --max_seq_length 4096 \ 62 | --do_train True \ 63 | --grad_checkpoint True \ 64 | --group_by_length True \ 65 | --dynamic_image_size True \ 66 | --use_thumbnail True \ 67 | --ps_version 'v2' \ 68 | --deepspeed "RagVL/internvl_chat/zero_stage1_config.json" \ 69 | --report_to "tensorboard" \ 70 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 71 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 2 22 | # gradient accumulation steps: 8 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-26B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.4 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 2e-5 \ 55 | --weight_decay 0.05 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage3_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 2 22 | # gradient accumulation steps: 4 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-26B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 2e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage3_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-2B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.01 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-4} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | export CUDA_VISIBLE_DEVICES=0,1,2,3 14 | 15 | OUTPUT_DIR='RagVL/checkpoints/internvl2_2b_1epoch-16batch_size-flickr30k-one-reranker-caption-clip-negatives-lora' 16 | 17 | if [ ! 
-d "$OUTPUT_DIR" ]; then 18 | mkdir -p "$OUTPUT_DIR" 19 | fi 20 | 21 | # number of gpus: 4 22 | # batch size per gpu: 4 23 | # gradient accumulation steps: 1 24 | # total batch size: 16 25 | # epoch: 1 26 | torchrun \ 27 | --nnodes=1 \ 28 | --node_rank=0 \ 29 | --master_addr=127.0.0.1 \ 30 | --nproc_per_node=${GPUS} \ 31 | --master_port=${MASTER_PORT} \ 32 | RagVL/internvl_chat/internvl/train/internvl_chat_finetune.py \ 33 | --model_name_or_path "OpenGVLab/InternVL2-2B" \ 34 | --conv_style "internlm2-chat" \ 35 | --output_dir ${OUTPUT_DIR} \ 36 | --meta_path "RagVL/internvl_chat/shell/data/internvl_2_finetune_flickr30k_rerank.json" \ 37 | --overwrite_output_dir True \ 38 | --force_image_size 448 \ 39 | --max_dynamic_patch 6 \ 40 | --down_sample_ratio 0.5 \ 41 | --drop_path_rate 0.0 \ 42 | --freeze_llm True \ 43 | --freeze_mlp True \ 44 | --freeze_backbone True \ 45 | --use_llm_lora 16 \ 46 | --vision_select_layer -1 \ 47 | --dataloader_num_workers 4 \ 48 | --bf16 True \ 49 | --num_train_epochs 1 \ 50 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 51 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 52 | --evaluation_strategy "no" \ 53 | --save_strategy "steps" \ 54 | --save_steps 200 \ 55 | --save_total_limit 1 \ 56 | --learning_rate 4e-5 \ 57 | --weight_decay 0.01 \ 58 | --warmup_ratio 0.03 \ 59 | --lr_scheduler_type "cosine" \ 60 | --logging_steps 1 \ 61 | --max_seq_length 4096 \ 62 | --do_train True \ 63 | --grad_checkpoint True \ 64 | --group_by_length True \ 65 | --dynamic_image_size True \ 66 | --use_thumbnail True \ 67 | --ps_version 'v2' \ 68 | --deepspeed "RagVL/internvl_chat/zero_stage1_config.json" \ 69 | --report_to "tensorboard" \ 70 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 71 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-512} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 16 23 | # total batch size: 512 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-2B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/coco_caption.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 128 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 4e-5 \ 56 | --weight_decay 0.01 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage1_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-16} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-128} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_full' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 16 26 | # batch size per gpu: 2 27 | # gradient accumulation steps: 4 28 | # total batch size: 128 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./pretrained/InternVL2-40B" \ 41 | --conv_style "Hermes-2" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 6 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.4 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone True \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 1 \ 62 | --learning_rate 2e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 4096 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage3_config_34b.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 2 22 | # gradient accumulation steps: 4 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-40B" \ 33 | --conv_style "Hermes-2" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 2e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage3_config_34b.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-4B" \ 33 | --conv_style "phi3-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.05 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-4B" \ 33 | --conv_style "phi3-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 4e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage1_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-32} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-128} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 32 26 | # batch size per gpu: 1 27 | # gradient accumulation steps: 4 28 | # total batch size: 128 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./pretrained/InternVL2-Llama3-76B" \ 41 | --conv_style "internlm2-chat" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 6 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.4 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone True \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 1 \ 62 | --learning_rate 2e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 4096 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage3_config_100b.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 1 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-Llama3-76B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 2e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage3_config_100b.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-8B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.01 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-8B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 4e-5 \ 56 | --weight_decay 0.01 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage1_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage1_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": true 41 | } 42 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e8, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e8, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | 
"lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } 42 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e7, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_100b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e4, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_34b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e5, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": 
true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_70b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e5, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model.language_model.llava_llama import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /llava/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in 
words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /llava/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '<image>' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /llava/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | 
parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 
69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /llava/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. 
* evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /llava/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | @torch.inference_mode() 14 | def eval_model(model_name, questions_file, answers_file): 15 | # Model 16 | disable_torch_init() 17 | model_name = os.path.expanduser(model_name) 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 19 | model = AutoModelForCausalLM.from_pretrained(model_name, 20 | torch_dtype=torch.float16).cuda() 21 | 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 41 | try: 42 | index = outputs.index(conv.sep, len(prompt)) 43 | except ValueError: 44 | outputs += conv.sep 45 | index = outputs.index(conv.sep, len(prompt)) 46 | 47 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 48 | ans_id = shortuuid.uuid() 49 | ans_file.write(json.dumps({"question_id": idx, 50 | "text": outputs, 51 | "answer_id": ans_id, 52 | "model_id": model_name, 53 | "metadata": {}}) + "\n") 54 | ans_file.flush() 55 | ans_file.close() 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 60 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 61 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 62 | args = parser.parse_args() 63 | 64 | eval_model(args.model_name, args.question_file, args.answers_file) 65 | -------------------------------------------------------------------------------- /llava/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def 
get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /llava/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is 
not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /llava/eval/webpage/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 3 | background-color: #f8f9fa; 4 | } 5 | 6 | .navbar-dark .navbar-nav .nav-link { 7 | color: #f1cf68; 8 | font-size: 1.1rem; 9 | padding: 0.5rem 0.6rem; 10 | } 11 | 12 | .card-header { 13 | font-weight: bold; 14 | } 15 | 16 | .card { 17 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 18 | transition: 0.3s; 19 | } 20 | 21 | .card:hover { 22 | box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); 23 | } 24 | 25 | button { 26 | transition: background-color 0.3s; 27 | } 28 | 29 | button:hover { 30 | background-color: #007bff; 31 | } 32 | 33 | @media (max-width: 767px) { 34 | .form-row .form-group { 35 | margin-bottom: 10px; 36 | } 37 | } 38 | 39 | /* Extra styles */ 40 | 41 | .expandable-card .card-text-container { 42 | max-height: 200px; 43 | overflow-y: hidden; 44 | position: relative; 45 | } 46 | 47 | .expandable-card.expanded .card-text-container { 48 | max-height: none; 49 | } 50 | 51 | .expand-btn { 52 | position: relative; 53 | display: none; 54 | background-color: rgba(255, 255, 255, 0.8); 55 | color: #510c75; 56 | border-color: transparent; 57 | } 58 | 59 | .expand-btn:hover { 60 | background-color: rgba(200, 200, 200, 0.8); 61 | text-decoration: none; 62 | border-color: transparent; 63 | color: #510c75; 64 | } 65 | 66 | .expand-btn:focus { 67 | outline: none; 68 | text-decoration: none; 69 | } 70 | 71 | .expandable-card:not(.expanded) .card-text-container:after { 72 | content: ""; 73 | position: absolute; 74 | bottom: 0; 75 | left: 0; 76 | width: 100%; 77 | height: 90px; 78 | background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); 79 | } 80 | 81 | .expandable-card:not(.expanded) .expand-btn { 82 | margin-top: -40px; 83 | } 84 | 85 | .card-body { 86 | padding-bottom: 5px; 87 | } 88 | 89 | .vertical-flex-layout { 90 | justify-content: center; 91 | align-items: center; 92 | height: 100%; 93 | display: flex; 94 | flex-direction: column; 95 | gap: 5px; 96 | } 97 | 98 | .figure-img { 99 | max-width: 100%; 100 | height: auto; 101 | } 102 | 103 | .adjustable-font-size { 104 | font-size: calc(0.5rem + 2vw); 105 | } 106 | -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | except: 6 | pass 7 | 
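Note on llava/model/__init__.py above: the bare "except: pass" swallows every import failure in the optional language-model backends, which can make missing-dependency problems hard to diagnose. A minimal sketch of a narrower guard, assuming only genuinely missing optional dependencies should be tolerated (an editorial sketch, not the repository's code):

import warnings

try:
    from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
    from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig
    from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig
except ImportError as exc:  # tolerate only missing optional backends; other errors still surface
    warnings.warn(f"Optional LLaVA language-model backend unavailable: {exc}")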
-------------------------------------------------------------------------------- /llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mpt.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, \ 21 | MptConfig, MptForCausalLM, MptModel 22 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 23 | 24 | 25 | class LlavaMptConfig(MptConfig): 26 | model_type = "llava_mpt" 27 | 28 | 29 | class LlavaMptModel(LlavaMetaModel, MptModel): 30 | config_class = LlavaMptConfig 31 | 32 | def __init__(self, config: MptConfig): 33 | config.hidden_size = config.d_model 34 | super(LlavaMptModel, self).__init__(config) 35 | 36 | def embed_tokens(self, x): 37 | return self.wte(x) 38 | 39 | 40 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 41 | config_class = LlavaMptConfig 42 | supports_gradient_checkpointing = True 43 | 44 | def __init__(self, config): 45 | super(MptForCausalLM, self).__init__(config) 46 | 47 | self.transformer = LlavaMptModel(config) 48 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.transformer 55 | 56 | def _set_gradient_checkpointing(self, module, value=False): 57 | if isinstance(module, LlavaMptModel): 58 | module.gradient_checkpointing = value 59 | 60 | def forward( 61 | self, 62 | input_ids: Optional[torch.LongTensor] = None, 63 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 64 | attention_mask: Optional[torch.Tensor] = None, 65 | inputs_embeds: Optional[torch.Tensor] = None, 66 | labels: Optional[torch.Tensor] = None, 67 | use_cache: Optional[bool] = None, 68 | output_attentions: Optional[bool] = None, 69 | output_hidden_states: Optional[bool] = None, 70 | return_dict: Optional[bool] = None, 71 | images=None): 72 | 73 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 74 | 75 | return super().forward( 76 | input_ids, 77 | past_key_values=past_key_values, 78 | attention_mask=attention_mask, 79 | inputs_embeds=inputs_embeds, 80 | labels=labels, 81 | use_cache=use_cache, 82 | output_attentions=output_attentions, 83 | output_hidden_states=output_hidden_states, 84 | return_dict=return_dict, 85 | ) 86 | 87 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 88 | images = kwargs.pop("images", None) 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 91 | ) 92 | _inputs['images'] = images 93 | return _inputs 94 | 95 | 96 | AutoConfig.register("llava_mpt", LlavaMptConfig) 97 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 98 | 
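Note: the AutoConfig.register and AutoModelForCausalLM.register calls at the end of llava_mpt.py hook the custom "llava_mpt" model type into the transformers Auto classes, so a checkpoint whose config.json declares that type resolves to LlavaMptForCausalLM without naming the class explicitly. A minimal sketch, assuming such a checkpoint folder exists locally (the path is a placeholder):

    import torch
    import llava.model  # importing the package runs the register() calls above
    from transformers import AutoConfig, AutoModelForCausalLM

    ckpt = "checkpoints/llava-mpt-7b"  # hypothetical folder; its config.json must say "model_type": "llava_mpt"
    cfg = AutoConfig.from_pretrained(ckpt)
    model = AutoModelForCausalLM.from_pretrained(ckpt, config=cfg, torch_dtype=torch.float16)
    print(type(model).__name__)  # LlavaMptForCausalLM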
-------------------------------------------------------------------------------- /llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- 
/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/serve/__init__.py -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 
38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | # train() 6 | -------------------------------------------------------------------------------- /llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /merge_lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from internvl_chat.internvl.model.internvl_chat import InternVLChatModel 3 | from transformers import AutoTokenizer 4 | 5 | input_path = ( 6 | "checkpoints/web/internvl2_2b_1epoch-16batch_size-webqa-reranker-caption-lora" 7 | ) 8 | output_path = ( 9 | "checkpoints/web/internvl2_2b_1epoch-16batch_size-webqa-reranker-caption-lora-merge" 10 | ) 11 | 12 | print("Loading model...") 13 | model = InternVLChatModel.from_pretrained( 14 | input_path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 15 | ).eval() 16 | print("Loading tokenizer...") 17 | tokenizer = AutoTokenizer.from_pretrained(input_path, trust_remote_code=True) 18 | 19 | if model.config.use_backbone_lora: 20 | model.vision_model.merge_and_unload() 21 | model.vision_model = model.vision_model.model 22 | model.config.use_backbone_lora = 0 23 | if model.config.use_llm_lora: 24 | model.language_model.merge_and_unload() 25 | model.language_model = model.language_model.model 26 | model.config.use_llm_lora = 0 27 | 28 | print("Saving model...") 29 | model.save_pretrained(output_path) 30 | print("Saving tokenizer...") 31 | tokenizer.save_pretrained(output_path) 32 | print("Done!") 33 | -------------------------------------------------------------------------------- /mmqa_oracle.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import ipdb 3 | import json 4 | from tqdm import tqdm 5 | import 
numpy as np 6 | 7 | from utils.metrics import mmqa_metrics_approx 8 | from utils.model_series import load_generator 9 | from utils.utils import infer 10 | import argparse 11 | 12 | 13 | ############### CLIP + Rerank ############### 14 | def baseline_generate( 15 | val_dataset, 16 | generator_path, 17 | tokenizer, 18 | image_processor, 19 | generator_model, 20 | ): 21 | acc_scores = {"ALL": []} 22 | 23 | with open("datasets/MMQA_ImageQ_metadata.json", "r") as f: 24 | metadata = json.load(f) 25 | 26 | for datum in tqdm(val_dataset): 27 | qid = datum["qid"] 28 | question = datum["question"] 29 | answer = datum["answers"][0]["answer"] 30 | pos_imgs = datum["supporting_context"] 31 | 32 | pos_source = [] 33 | 34 | for item in pos_imgs: 35 | pos_source.append(item["doc_id"]) 36 | 37 | IMAGE_PATH = "" 38 | for i in range(len(pos_source)): 39 | IMAGE_PATH += "finetune/tasks/MMQA_imgs/" + metadata[pos_source[i]]["path"] 40 | if i != len(pos_source) - 1: 41 | IMAGE_PATH += "," 42 | 43 | output = infer( 44 | generator_path, 45 | IMAGE_PATH, 46 | question, 47 | generator_model, 48 | tokenizer, 49 | image_processor, 50 | from_array=False, 51 | ) 52 | 53 | if "how many" in question.lower(): 54 | qcate = "number" 55 | else: 56 | qcate = "normal" 57 | 58 | accuracy = mmqa_metrics_approx(output, answer, qcate) 59 | acc_scores["ALL"].append(accuracy) 60 | 61 | print("Generation ACC:", np.mean(acc_scores["ALL"])) 62 | 63 | 64 | def main(): 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--datasets", type=str, default="test") 67 | parser.add_argument("--generator_model", type=str, default="noise_injected_lora") 68 | parser.add_argument("--series", type=str, default="llava") 69 | args = parser.parse_args() 70 | print(args) 71 | 72 | (tokenizer, generator_model, image_processor), generator_path = load_generator( 73 | args, "mmqa" 74 | ) 75 | 76 | if args.datasets == "test": 77 | with open("datasets/MMQA_test_image.json", "r") as f: 78 | val_dataset = json.load(f) 79 | 80 | elif args.datasets == "dev": 81 | with open("datasets/MMQA_test_image.json", "r") as f: 82 | val_dataset = json.load(f) 83 | 84 | with torch.no_grad(): 85 | baseline_generate( 86 | val_dataset, 87 | generator_path, 88 | tokenizer, 89 | image_processor, 90 | generator_model, 91 | ) 92 | 93 | print(args) 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /mplug_owl2/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import MPLUGOwl2LlamaForCausalLM -------------------------------------------------------------------------------- /mplug_owl2/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "./demo_logs" 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<|image|>" 10 | -------------------------------------------------------------------------------- /mplug_owl2/evaluate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/evaluate/__init__.py -------------------------------------------------------------------------------- /mplug_owl2/evaluate/mmbench_converter.py: -------------------------------------------------------------------------------- 1 | 
import pandas as pd 2 | import io 3 | import base64 4 | import json 5 | from PIL import Image 6 | 7 | ''' 8 | This scripts convert mmbench_dev tsv file to jsonl 9 | ''' 10 | 11 | datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 12 | 13 | global_choices = ['A', 'B', 'C', 'D'] 14 | 15 | def decode_base64_to_image(base64_string): 16 | image_data = base64.b64decode(base64_string) 17 | image = Image.open(io.BytesIO(image_data)) 18 | return image 19 | 20 | 21 | with open('./data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 'w') as f: 22 | for idx in range(len(datas)): 23 | data = datas.iloc[idx] 24 | 25 | index = int(data['index']) 26 | question = data['question'] 27 | hint = data['hint'] if not pd.isna(data['hint']) else 'N/A' 28 | 29 | choices = [] 30 | for opt in global_choices: 31 | if pd.isna(data[opt]): 32 | continue 33 | choices.append(data[opt]) 34 | 35 | answer = global_choices.index(data['answer']) 36 | 37 | image = decode_base64_to_image(data['image']) 38 | image.save("data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index) 39 | 40 | f.write(json.dumps({ 41 | "index": index, 42 | "image": "data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index, 43 | "hint": hint, 44 | "question": question, 45 | "choices": choices, 46 | "answer": answer, 47 | }) + "\n") -------------------------------------------------------------------------------- /mplug_owl2/local_serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/local_serve/__init__.py -------------------------------------------------------------------------------- /mplug_owl2/local_serve/examples/Rebecca_(1939_poster)_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/local_serve/examples/Rebecca_(1939_poster)_Small.jpeg -------------------------------------------------------------------------------- /mplug_owl2/local_serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/local_serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /mplug_owl2/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling_mplug_owl2 import MPLUGOwl2LlamaForCausalLM, MPLUGOwl2QWenForCausalLM 2 | from .configuration_mplug_owl2 import MPLUGOwl2Config, MPLUGOwl2QwenConfig 3 | -------------------------------------------------------------------------------- /mplug_owl2/model/configuration_qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 2 | # 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | from transformers import PretrainedConfig 7 | 8 | 9 | class QWenConfig(PretrainedConfig): 10 | model_type = "qwen" 11 | keys_to_ignore_at_inference = ["past_key_values"] 12 | 13 | def __init__( 14 | self, 15 | multiway=False, 16 | vocab_size=151936, 17 | hidden_size=4096, 18 | num_hidden_layers=32, 19 | num_attention_heads=32, 20 | emb_dropout_prob=0.0, 21 | attn_dropout_prob=0.0, 22 | layer_norm_epsilon=1e-6, 23 | initializer_range=0.02, 24 | max_position_embeddings=8192, 25 | scale_attn_weights=True, 26 | use_cache=True, 27 | bf16=False, 28 | fp16=False, 29 | fp32=False, 30 | kv_channels=128, 31 | rotary_pct=1.0, 32 | rotary_emb_base=10000, 33 | use_dynamic_ntk=True, 34 | use_logn_attn=True, 35 | use_flash_attn="auto", 36 | intermediate_size=22016, 37 | no_bias=True, 38 | tie_word_embeddings=False, 39 | use_cache_quantization=False, 40 | use_cache_kernel=False, 41 | softmax_in_fp32=False, 42 | **kwargs, 43 | ): 44 | self.multiway = multiway 45 | self.vocab_size = vocab_size 46 | self.hidden_size = hidden_size 47 | self.intermediate_size = intermediate_size 48 | self.num_hidden_layers = num_hidden_layers 49 | self.num_attention_heads = num_attention_heads 50 | self.emb_dropout_prob = emb_dropout_prob 51 | self.attn_dropout_prob = attn_dropout_prob 52 | self.layer_norm_epsilon = layer_norm_epsilon 53 | self.initializer_range = initializer_range 54 | self.scale_attn_weights = scale_attn_weights 55 | self.use_cache = use_cache 56 | self.max_position_embeddings = max_position_embeddings 57 | self.bf16 = bf16 58 | self.fp16 = fp16 59 | self.fp32 = fp32 60 | self.kv_channels = kv_channels 61 | self.rotary_pct = rotary_pct 62 | self.rotary_emb_base = rotary_emb_base 63 | self.use_dynamic_ntk = use_dynamic_ntk 64 | self.use_logn_attn = use_logn_attn 65 | self.use_flash_attn = use_flash_attn 66 | self.no_bias = no_bias 67 | self.use_cache_quantization = use_cache_quantization 68 | self.use_cache_kernel = use_cache_kernel 69 | self.softmax_in_fp32 = softmax_in_fp32 70 | super().__init__( 71 | tie_word_embeddings=tie_word_embeddings, 72 | **kwargs 73 | ) -------------------------------------------------------------------------------- /mplug_owl2/model/multiway.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.utils.checkpoint 4 | from torch import nn 5 | 6 | 7 | class MultiwayNetwork(nn.Module): 8 | 9 | def __init__(self, module_provider, num_multiway=2, out_features=None): 10 | super(MultiwayNetwork, self).__init__() 11 | 12 | self.multiway = torch.nn.ModuleList([module_provider() for _ in range(num_multiway)]) 13 | self.out_features=out_features 14 | def forward(self, hidden_states, multiway_indices): 15 | 16 | if len(self.multiway) == 1: 17 | return self.multiway[0](hidden_states) 18 | if self.out_features: 19 | output_hidden_states = torch.empty( 20 | hidden_states.size(0), hidden_states.size(1), self.out_features, 21 | dtype=hidden_states.dtype 22 | ).to(hidden_states.device) 23 | else: 24 | output_hidden_states = torch.empty_like(hidden_states) 25 | for idx, subway in enumerate(self.multiway): 26 | local_indices = multiway_indices.eq(idx).nonzero(as_tuple=True) 27 | hidden = hidden_states[local_indices].unsqueeze(1).contiguous() 28 | if hidden.numel(): 29 | output = subway(hidden) 30 | if isinstance(output, tuple): 31 | output = output[0] 32 | output = output.squeeze(1) 33 | output_hidden_states[local_indices] = output 34 | 35 | return output_hidden_states.contiguous() 
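Note: MultiwayNetwork keeps one expert module per modality and, for every position in the sequence, dispatches the hidden state to the expert selected by multiway_indices (in mPLUG-Owl2 this roughly separates text tokens from visual tokens). A minimal sketch of the routing with made-up sizes and a plain Linear expert:

    import torch
    from torch import nn
    from mplug_owl2.model.multiway import MultiwayNetwork

    hidden = torch.randn(2, 6, 16)                    # (batch, seq_len, hidden_size)
    indices = torch.tensor([[0, 0, 1, 1, 0, 0],
                            [1, 1, 1, 0, 0, 0]])      # 0 -> expert 0 (e.g. text), 1 -> expert 1 (e.g. visual)

    layer = MultiwayNetwork(module_provider=lambda: nn.Linear(16, 16), num_multiway=2)
    out = layer(hidden, indices)                      # routed per position, same shape as hidden
    print(out.shape)                                  # torch.Size([2, 6, 16])

Each expert only sees the positions assigned to it, and the results are scattered back into one output tensor, so the layer acts as a drop-in replacement for a single dense projection.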
-------------------------------------------------------------------------------- /mplug_owl2/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'mplug_owl2' in config and 'mplug_owl2' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer mPLUG-Owl2 code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "mplug_owl2") 15 | cfg.architectures[0] = 'MPLUGOwl2LlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) -------------------------------------------------------------------------------- /mplug_owl2/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/serve/__init__.py -------------------------------------------------------------------------------- /mplug_owl2/serve/examples/Rebecca_(1939_poster)_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/serve/examples/Rebecca_(1939_poster)_Small.jpeg -------------------------------------------------------------------------------- /mplug_owl2/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /mplug_owl2/serve/register_workers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 -------------------------------------------------------------------------------- /mplug_owl2/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.
4 | 5 | # Need to call this before importing transformers. 6 | from mplug_owl2.train.llama_flash_attn_monkey_patch import ( 7 | replace_llama_attn_with_flash_attn, 8 | ) 9 | 10 | replace_llama_attn_with_flash_attn() 11 | from mplug_owl2.train.train import train 12 | 13 | if __name__ == "__main__": 14 | train() 15 | -------------------------------------------------------------------------------- /qwenvl/run_qwenvl.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | from transformers.generation import GenerationConfig 3 | from peft import AutoPeftModelForCausalLM 4 | import torch 5 | 6 | 7 | def qwen_eval_relevance(image_path, question, model, tokenizer): 8 | 9 | query_list = [{"image": image_path}] 10 | 11 | query_list.append({"text": question}) 12 | 13 | query = tokenizer.from_list_format(query_list) 14 | outputs = model.chat( 15 | tokenizer, 16 | query=query, 17 | history=None, 18 | return_dict_in_generate=True, 19 | output_scores=True, 20 | do_sample=False, 21 | ) 22 | 23 | logits = outputs.scores[0][0] 24 | 25 | probs = ( 26 | torch.nn.functional.softmax( 27 | torch.FloatTensor( 28 | [ 29 | logits[tokenizer("Yes").input_ids[0]], 30 | logits[tokenizer("No").input_ids[0]], 31 | ] 32 | ), 33 | dim=0, 34 | ) 35 | .detach() 36 | .cpu() 37 | .numpy() 38 | ) 39 | 40 | return probs[0] 41 | 42 | 43 | def qwen_chat(image_path, question, model, tokenizer): 44 | 45 | query_list = [] 46 | if image_path: 47 | for img in image_path.split(","): 48 | query_list.append({"image": img}) 49 | 50 | query_list.append({"text": question}) 51 | 52 | query = tokenizer.from_list_format(query_list) 53 | response, _ = model.chat(tokenizer, query=query, history=None, do_sample=False) 54 | 55 | return response 56 | 57 | 58 | if __name__ == "__main__": 59 | model_path = "Qwen/Qwen-VL-Chat" 60 | adapter_path = ( 61 | "../checkpoints/qwen-vl-chat-2epoch-4batch_size-webqa-reranker-caption-lora" 62 | ) 63 | 64 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 65 | 66 | mm_model = AutoPeftModelForCausalLM.from_pretrained( 67 | adapter_path, # path to the output directory 68 | device_map="auto", 69 | trust_remote_code=True, 70 | ).eval() 71 | 72 | image_path = "../assets/framework.png" 73 | query = "Image Caption: Centennial Olympic Park splash fountain\nQuestion:\"Are there more than 6 tall lamp posts surrounding the fountain at Centennial Park?\"\nBased on the image and its caption, is the image relevant to the question? Answer 'Yes' or 'No'." 
74 | # ans = qwen_chat(image_path, query, mm_model, tokenizer) 75 | ans = qwen_eval_relevance(image_path, query, mm_model, tokenizer) 76 | print(ans) 77 | -------------------------------------------------------------------------------- /utils/FlagEmbedding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/utils/FlagEmbedding/__init__.py -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import Visualized_BGE -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_eva_vision_and_transforms 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg,\ 6 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 7 | from .openai import load_openai_model, list_openai_models 8 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\ 9 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 10 | from .tokenizer import SimpleTokenizer, tokenize 11 | from .transform import image_transform -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/utils/FlagEmbedding/visual/eva_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 
| "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | "bert": { 46 | "config_names": { 47 | "context_length": "max_position_embeddings", 48 | "vocab_size": "vocab_size", 49 | "width": "hidden_size", 50 | "heads": "num_attention_heads", 51 | "layers": "num_hidden_layers", 52 | "layer_attr": "layer", 53 | "token_embeddings_attr": "embeddings" 54 | }, 55 | "pooler": "mean_pooler", 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | 
"xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true, 19 | "patch_dropout": 0.5 20 | }, 21 | "text_cfg": { 22 | "context_length": 77, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "xattn": true, 28 | "fusedLN": true 29 | } 30 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 
-------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/transform.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms.functional as F 6 | 7 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ 8 | CenterCrop 9 | 10 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 11 | 12 | 13 | class ResizeMaxSize(nn.Module): 14 | 15 | def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0): 16 | super().__init__() 17 | if not isinstance(max_size, int): 18 | raise TypeError(f"Size should be int. Got {type(max_size)}") 19 | self.max_size = max_size 20 | self.interpolation = interpolation 21 | self.fn = min if fn == 'min' else min 22 | self.fill = fill 23 | 24 | def forward(self, img): 25 | if isinstance(img, torch.Tensor): 26 | height, width = img.shape[:2] 27 | else: 28 | width, height = img.size 29 | scale = self.max_size / float(max(height, width)) 30 | if scale != 1.0: 31 | new_size = tuple(round(dim * scale) for dim in (height, width)) 32 | img = F.resize(img, new_size, self.interpolation) 33 | pad_h = self.max_size - new_size[0] 34 | pad_w = self.max_size - new_size[1] 35 | img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill) 36 | return img 37 | 38 | 39 | def _convert_to_rgb(image): 40 | return image.convert('RGB') 41 | 42 | 43 | # class CatGen(nn.Module): 44 | # def __init__(self, num=4): 45 | # self.num = num 46 | # def mixgen_batch(image, text): 47 | # batch_size = image.shape[0] 48 | # index = np.random.permutation(batch_size) 49 | 50 | # cat_images = [] 51 | # for i in range(batch_size): 52 | # # image mixup 53 | # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:] 54 | # # text concat 55 | # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0] 56 | # text = torch.stack(text) 57 | # return image, text 58 | 59 | 60 | def image_transform( 61 | image_size: int, 62 | is_train: bool, 63 | mean: Optional[Tuple[float, ...]] = None, 64 | std: Optional[Tuple[float, ...]] = None, 65 | resize_longest_max: bool = False, 66 | fill_color: int = 0, 67 | ): 68 | mean = mean or OPENAI_DATASET_MEAN 69 | if not isinstance(mean, (list, tuple)): 70 | mean = (mean,) * 3 71 | 72 | std = std or OPENAI_DATASET_STD 73 | if not isinstance(std, (list, tuple)): 74 | std = (std,) * 3 75 | 76 | if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: 77 | # for square size, pass size as int so that Resize() uses aspect preserving shortest edge 78 | image_size = image_size[0] 79 | 80 | normalize = Normalize(mean=mean, std=std) 81 | if is_train: 82 | return Compose([ 83 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), 84 | _convert_to_rgb, 85 | ToTensor(), 86 | normalize, 87 | ]) 88 | else: 89 | if resize_longest_max: 90 | transforms = [ 91 | ResizeMaxSize(image_size, fill=fill_color) 92 | ] 93 | else: 94 | transforms = [ 95 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 96 | CenterCrop(image_size), 97 | ] 98 | transforms.extend([ 99 | _convert_to_rgb, 100 | ToTensor(), 101 | normalize, 102 | ]) 103 | return Compose(transforms) 104 | -------------------------------------------------------------------------------- /utils/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/utils/__init__.py -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | from llava.mm_utils import get_model_name_from_path 2 | from llava.eval.run_llava import llava_chat, llava_eval_relevance 3 | from mplug_owl2.evaluate.run_mplug_owl2 import owl_chat, owl_eval_relevance 4 | from qwenvl.run_qwenvl import qwen_chat, qwen_eval_relevance 5 | from internvl_chat.eval.run_internvl import ( 6 | internvl_chat, 7 | internvl_eval_relevance, 8 | ) 9 | 10 | 11 | def cal_relevance(model_path, image_path, question, model, tokenizer, image_processor): 12 | 13 | if "qwen-vl" in model_path.lower(): 14 | prob = qwen_eval_relevance(image_path, question, model, tokenizer) 15 | else: 16 | args = type( 17 | "Args", 18 | (), 19 | { 20 | "model_path": model_path, 21 | "model_base": None, 22 | "model_name": get_model_name_from_path(model_path), 23 | "query": question, 24 | "conv_mode": None, 25 | "image_file": image_path, 26 | "sep": ",", 27 | "temperature": 0, 28 | "top_p": None, 29 | "num_beams": 1, 30 | "max_new_tokens": 512, 31 | }, 32 | )() 33 | 34 | if "llava" in model_path: 35 | prob = llava_eval_relevance(args, tokenizer, model, image_processor) 36 | elif "mplug-owl2" in model_path: 37 | prob = owl_eval_relevance(args, tokenizer, model, image_processor) 38 | elif "internvl" in model_path.lower(): 39 | prob = internvl_eval_relevance(args, tokenizer, model) 40 | 41 | return prob 42 | 43 | 44 | def infer( 45 | model_path, 46 | image_file, 47 | question, 48 | model, 49 | tokenizer, 50 | image_processor, 51 | from_array=False, 52 | ): 53 | if "webqa" in model_path: 54 | prompt_template = question 55 | else: 56 | prompt_template = ( 57 | f"{question}\nAnswer the question using a single word or phrase." 
58 | ) 59 | 60 | if "qwen-vl" in model_path.lower(): 61 | output = qwen_chat(image_file, prompt_template, model, tokenizer) 62 | else: 63 | args = type( 64 | "Args", 65 | (), 66 | { 67 | "model_path": model_path, 68 | "model_base": None, 69 | "model_name": get_model_name_from_path(model_path), 70 | "query": prompt_template, 71 | "conv_mode": None, 72 | "image_file": image_file, 73 | "sep": ",", 74 | "temperature": 0, 75 | "top_p": None, 76 | "num_beams": 1, 77 | "max_new_tokens": 512, 78 | }, 79 | )() 80 | 81 | if "llava" in model_path: 82 | output = llava_chat( 83 | args, 84 | tokenizer, 85 | model, 86 | image_processor, 87 | from_array=from_array, 88 | ) 89 | elif "mplug-owl2" in model_path: 90 | output = owl_chat(args, tokenizer, model, image_processor) 91 | elif "internvl" in model_path.lower(): 92 | output = internvl_chat(args, tokenizer, model) 93 | 94 | return output 95 | -------------------------------------------------------------------------------- /vcd_utils/vcd_add_noise.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def add_diffusion_noise(image_tensor, noise_step): 5 | num_steps = 1000 # Number of diffusion steps 6 | 7 | # decide beta in each step 8 | betas = torch.linspace(-6, 6, num_steps) 9 | betas = torch.sigmoid(betas) * (0.5e-2 - 1e-5) + 1e-5 10 | 11 | # decide alphas in each step 12 | alphas = 1 - betas 13 | alphas_prod = torch.cumprod(alphas, dim=0) 14 | alphas_prod_p = torch.cat( 15 | [torch.tensor([1]).float(), alphas_prod[:-1]], 0 16 | ) # p for previous 17 | alphas_bar_sqrt = torch.sqrt(alphas_prod) 18 | one_minus_alphas_bar_log = torch.log(1 - alphas_prod) 19 | one_minus_alphas_bar_sqrt = torch.sqrt(1 - alphas_prod) 20 | 21 | def q_x(x_0, t): 22 | noise = torch.randn_like(x_0) 23 | alphas_t = alphas_bar_sqrt[t] 24 | alphas_1_m_t = one_minus_alphas_bar_sqrt[t] 25 | return alphas_t * x_0 + alphas_1_m_t * noise 26 | 27 | noise_delta = int(noise_step) # from 0-999 28 | noisy_image = image_tensor.clone() 29 | image_tensor_cd = q_x(noisy_image, noise_step) 30 | 31 | return image_tensor_cd 32 | --------------------------------------------------------------------------------
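Note: add_diffusion_noise follows the standard DDPM forward process. It builds a 1000-step sigmoid beta schedule, forms the cumulative products alphas_prod, and returns sqrt(alphas_prod[t]) * x_0 + sqrt(1 - alphas_prod[t]) * noise, so a larger noise_step hands back an increasingly corrupted copy of the input image tensor (the distorted visual input used for VCD-style contrastive decoding). A minimal usage sketch on a random tensor, with the step values chosen arbitrarily:

    import torch
    from vcd_utils.vcd_add_noise import add_diffusion_noise

    image_tensor = torch.randn(3, 336, 336)   # stand-in for a preprocessed image tensor
    for t in (200, 500, 999):
        noisy = add_diffusion_noise(image_tensor, noise_step=t)
        drift = torch.nn.functional.mse_loss(noisy, image_tensor).item()
        print(t, round(drift, 3))             # the distortion grows with the step index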