├── .gitignore ├── LICENSE ├── README.md ├── assets └── framework.png ├── finetune └── scripts │ ├── llava │ ├── finetune_task_lora.sh │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json │ ├── mplug_owl2 │ ├── finetune_lora.sh │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json │ └── qwenvl │ ├── ds_config_zero2.json │ ├── ds_config_zero3.json │ ├── finetune.py │ ├── finetune_ds.sh │ ├── finetune_lora_ds.sh │ ├── finetune_lora_single_gpu.sh │ ├── finetune_qlora_ds.sh │ └── finetune_qlora_single_gpu.sh ├── flickr30k_pipeline.py ├── internvl_chat ├── README.md ├── eval │ └── run_internvl.py ├── internvl │ ├── conversation.py │ ├── dist_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── internlm2 │ │ │ ├── configuration_internlm2.py │ │ │ ├── modeling_internlm2.py │ │ │ ├── tokenization_internlm2.py │ │ │ └── tokenization_internlm2_fast.py │ │ ├── internvl_chat │ │ │ ├── __init__.py │ │ │ ├── configuration_intern_vit.py │ │ │ ├── configuration_internvl_chat.py │ │ │ ├── flash_attention.py │ │ │ ├── modeling_intern_vit.py │ │ │ └── modeling_internvl_chat.py │ │ └── phi3 │ │ │ ├── configuration_phi3.py │ │ │ └── modeling_phi3.py │ ├── patch │ │ ├── __init__.py │ │ ├── llama2_flash_attn_monkey_patch.py │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_rmsnorm_monkey_patch.py │ │ ├── pad_data_collator.py │ │ └── train_sampler_patch.py │ └── train │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── dataset.py │ │ ├── internvl_chat_finetune.py │ │ ├── internvl_chat_pretrain.py │ │ └── trainer_monkey_patch.py ├── pyproject.toml ├── shell │ ├── data │ │ ├── internvl_2_finetune_flickr30k_rerank.json │ │ ├── internvl_2_finetune_mmqa_qa.json │ │ ├── internvl_2_finetune_mmqa_rerank.json │ │ ├── internvl_2_finetune_mscoco_rerank.json │ │ ├── internvl_2_finetune_webqa_qa.json │ │ └── internvl_2_finetune_webqa_rerank.json │ ├── internvl1.2 │ │ ├── 2nd_finetune │ │ │ ├── internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh │ │ │ └── internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_lora.sh │ │ └── hermes2_yi34b │ │ │ └── internvl_chat_v1_2_hermes2_yi34b_448_res_finetune.sh │ ├── internvl1.5 │ │ ├── 2nd_finetune │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh │ │ │ ├── internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh │ │ │ └── internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh │ │ ├── hermes2_yi34b │ │ │ ├── internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh │ │ ├── internlm2_1_8b │ │ │ ├── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain.sh │ │ ├── internlm2_20b │ │ │ ├── internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh │ │ └── phi3_3_8b │ │ │ ├── internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh │ │ │ └── internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain.sh │ └── internvl2.0 │ │ └── 2nd_finetune │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_lora.sh │ │ ├── internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_lora.sh │ │ ├── 
internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh │ │ ├── internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco.sh │ │ ├── internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_lora.sh │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh │ │ ├── internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full.sh │ │ ├── internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora.sh │ │ ├── internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full.sh │ │ └── internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora.sh ├── zero_stage1_config.json ├── zero_stage2_config.json ├── zero_stage3_config.json ├── zero_stage3_config_100b.json ├── zero_stage3_config_34b.json └── zero_stage3_config_70b.json ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ ├── table │ │ ├── answer │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ ├── answer_bard.jsonl │ │ │ ├── answer_gpt35.jsonl │ │ │ ├── answer_llama-13b.jsonl │ │ │ └── answer_vicuna-13b.jsonl │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ ├── model.jsonl │ │ ├── prompt.jsonl │ │ ├── question.jsonl │ │ ├── results │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ ├── review │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ ├── reviewer.jsonl │ │ └── rule.json │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_mistral.py │ │ └── llava_mpt.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── merge_lora.py ├── mmqa_oracle.py ├── mmqa_pipeline.py ├── mplug_owl2 ├── __init__.py ├── constants.py ├── conversation.py ├── evaluate │ ├── EVALUATION.md │ ├── __init__.py │ ├── evaluate_caption.py │ ├── evaluate_mmbench.py │ ├── evaluate_mme.py │ ├── evaluate_mmmu.py │ ├── evaluate_vqa.py │ ├── mmbench_converter.py │ ├── run_mplug_owl2.py │ ├── vqa.py │ └── vqa_eval.py ├── local_serve │ ├── __init__.py │ ├── 
examples │ │ ├── Rebecca_(1939_poster)_Small.jpeg │ │ └── extreme_ironing.jpg │ ├── local_web_server.py │ └── model_worker.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── builder.py │ ├── configuration_mplug_owl2.py │ ├── configuration_qwen.py │ ├── convert_mplug_owl2_weight_to_hf.py │ ├── modeling_attn_mask_utils.py │ ├── modeling_llama2.py │ ├── modeling_mplug_owl2.py │ ├── modeling_qwen.py │ ├── multiway.py │ ├── utils.py │ └── visual_encoder.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── Rebecca_(1939_poster)_Small.jpeg │ │ └── extreme_ironing.jpg │ ├── gradio_web_server.py │ ├── model_worker.py │ └── register_workers.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── mplug_owl2_trainer.py │ ├── train.py │ └── train_mem.py └── utils.py ├── mscoco_pipeline.py ├── qwenvl └── run_qwenvl.py ├── requirements.txt ├── utils ├── FlagEmbedding │ ├── __init__.py │ └── visual │ │ ├── README.md │ │ ├── __init__.py │ │ ├── eva_clip │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── constants.py │ │ ├── eva_vit_model.py │ │ ├── factory.py │ │ ├── hf_configs.py │ │ ├── hf_model.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ └── EVA02-CLIP-bigE-14.json │ │ ├── modified_resnet.py │ │ ├── openai.py │ │ ├── pretrained.py │ │ ├── rope.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── transformer.py │ │ └── utils.py │ │ └── modeling.py ├── __init__.py ├── indexing_faiss.py ├── metrics.py ├── model_series.py └── utils.py ├── vcd_utils ├── vcd_add_noise.py └── vcd_sample.py ├── webqa_oracle.py └── webqa_pipeline.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__ 3 | *.pyc 4 | *.egg-info 5 | dist 6 | 7 | # Log 8 | *.log 9 | *.log.* 10 | 11 | # Data 12 | *.tar.gz 13 | # Other 14 | .DS_Store 15 | wandb 16 | output 17 | checkpoints 18 | datasets 19 | finetune/tasks 20 | ckpts* 21 | 22 | .ipynb_checkpoints 23 | *.ipynb 24 | 25 | visualize.py 26 | attention_visualization 27 | *.pth 28 | logs 29 | test_chatgpt.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 IDEA-FinAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RagVL
2 | This is the official repo for the paper: ["MLLM Is a Strong Reranker: Advancing Multimodal Retrieval-augmented Generation via Knowledge-enhanced Reranking and Noise-injected Training"](https://arxiv.org/pdf/2407.21439).
3 |
4 | ![image](https://github.com/IDEA-FinAI/RagVL/blob/main/assets/framework.png)
5 |
6 | ## Updates
7 | - [2024-09-20]: To better reflect the generality of our proposed method, we have renamed it to **RagVL**.
8 | - [2024-08-05]: Code of RagVL (RagLLaVA) released.
9 | - [2024-07-31]: Paper of RagVL (RagLLaVA) available online.
10 |
11 | ## Getting Started
12 | ### Environment Setup
13 | The required libraries for running RagVL can be found in `requirements.txt`. We recommend following [LLaVA](https://github.com/haotian-liu/LLaVA) to configure your environment.
14 |
15 | ### Data Preparation
16 | Before running RagVL, please:
17 |
18 | 1. Download the **datasets** and **checkpoints** from [Google Drive](https://drive.google.com/drive/folders/1wY18Vbrb8yDbFSg1Te-FQIs84AYYh48Z?usp=drive_link).
19 |
20 | 2. Download the **image files** from [WebQA](https://github.com/WebQnA/WebQA) and [MultimodalQA](https://github.com/allenai/multimodalqa).
21 |
22 | 3. Unzip the downloaded files. Place the `checkpoints/` and `datasets/` into `RagVL/`.
23 |
24 | 4. Place the `tasks/` into `RagVL/finetune/`.
25 |
26 | 5. Place the `MMQA_imgs/` and `train_img/` into `RagVL/finetune/tasks/`.
27 |
28 | 6. Place the `val_image/` into `RagVL/datasets/`.
29 |
30 | ## Training
31 | 1. Reranker
32 |
33 | | Models | Global Batch Size | Epochs |
34 | | --- | ---: | ---: |
35 | | LLaVA-v1.5-13B | 16 | 2 (WebQA) / 1 (others) |
36 | | Qwen-VL-Chat | 16 | 2 (WebQA) / 1 (others) |
37 | | mPLUG-Owl2 | 16 | 2 (WebQA) / 1 (others) |
38 | | InternVL2-1B | 16 | 1 |
39 | | InternVL2-2B | 16 | 1 |
40 |
41 | 2. Generator
42 |
43 | | Models | Global Batch Size | Epochs |
44 | | --- | ---: | ---: |
45 | | LLaVA-v1.5-13B | 16 | 2 (WebQA) / 3 (MMQA) |
46 | | InternVL2-1B | 16 | 1 |
47 | | InternVL2-2B | 16 | 1 |
48 |
49 | Apart from these two hyperparameters, all other settings follow each model's defaults.
50 |
51 | To finetune LLaVA-v1.5-13B, Qwen-VL-Chat, and mPLUG-Owl2, find the corresponding finetune script in `RagVL/finetune/scripts/`.
52 |
53 | To finetune InternVL2-1B and InternVL2-2B, find the corresponding finetune script in `RagVL/internvl_chat/shell/internvl2.0/2nd_finetune`.
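For example, assuming the data and checkpoints have been placed as described above, a launch from the `RagVL/` repository root might look like the sketch below. These exact invocations are illustrative rather than prescribed by the scripts' authors; the GPU lists, data paths, and output directories are hard-coded inside each script and will likely need to be adjusted for your machine.

```bash
# LLaVA-v1.5-13B LoRA reranker finetuning (as configured, on the Flickr30k rerank data).
# Run from the script's own directory so its relative ../../ paths resolve.
(cd finetune/scripts/llava && bash finetune_task_lora.sh)

# InternVL2-2B LoRA 2nd finetuning.
# Run from internvl_chat/ so the script's PYTHONPATH export and ./shell/... paths resolve.
(cd internvl_chat && bash shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh)
```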
54 |
55 | ## Evaluation
56 | To evaluate RagVL on WebQA / MultimodalQA, use the following command:
57 |
58 | ```
59 | python webqa_pipeline.py \ # same arguments apply to mmqa_pipeline.py
60 | --reranker_model caption_lora \ # select the reranker
61 | --generator_model noise_injected_lora \ # select the generator
62 | --filter 0 \ # select the adaptive threshold
63 | --clip_topk 20 # we first retrieve 20 candidates by default
64 | ```
65 |
66 | To evaluate the oracle settings on WebQA / MultimodalQA, use the following command:
67 |
68 | ```
69 | python webqa_oracle.py # same arguments apply to mmqa_oracle.py
70 | ```
71 |
72 | ## Citation
73 | If you find this work interesting or inspiring, please cite us:
74 | ```bibtex
75 | @article{chen2024mllm,
76 | title={MLLM Is a Strong Reranker: Advancing Multimodal Retrieval-augmented Generation via Knowledge-enhanced Reranking and Noise-injected Training},
77 | author={Chen, Zhanpeng and Xu, Chengjin and Qi, Yiyan and Guo, Jian},
78 | journal={arXiv preprint arXiv:2407.21439},
79 | year={2024}
80 | }
81 | ```
82 |
83 | ## Related Projects
84 | - [LLaVA](https://github.com/haotian-liu/LLaVA): Large Language and Vision Assistant
85 | - [Qwen-VL](https://github.com/QwenLM/Qwen-VL): A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond
86 | - [mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl): The Powerful Multi-modal Large Language Model Family
87 | - [InternVL](https://github.com/OpenGVLab/InternVL): A Pioneering Open-Source Alternative to GPT-4o
88 | - [Visualized BGE](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/visual): A universal multi-modal embedding model
89 | - [VCD](https://github.com/DAMO-NLP-SG/VCD): Mitigating Object Hallucinations in Large Vision-Language Models through Visual Contrastive Decoding
90 | - [CAL](https://github.com/foundation-multimodal-models/CAL): Prioritizing Visual Correlation by Contrastive Alignment
91 |
92 |
93 |
--------------------------------------------------------------------------------
/assets/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/assets/framework.png
--------------------------------------------------------------------------------
/finetune/scripts/llava/finetune_task_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed --include localhost:4,5,6,7 ../../../llava/train/train_mem.py \
4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
5 | --deepspeed ./zero3.json \
6 | --model_name_or_path liuhaotian/llava-v1.5-13b \
7 | --version v1 \
8 | --data_path ../../tasks/Flickr30k_one_train_rerank.json \
9 | --image_folder ../../tasks \
10 | --vision_tower openai/clip-vit-large-patch14-336 \
11 | --mm_projector_type mlp2x_gelu \
12 | --mm_vision_select_layer -2 \
13 | --mm_use_im_start_end False \
14 | --mm_use_im_patch_token False \
15 | --image_aspect_ratio pad \
16 | --group_by_modality_length True \
17 | --bf16 True \
18 | --output_dir ../../../checkpoints/llava-v1.5-13b-2epoch-16batch_size-flickr30k-one-reranker-caption-lora \
19 | --num_train_epochs 2 \
20 | --per_device_train_batch_size 16 \
21 | --per_device_eval_batch_size 4 \
22 | --gradient_accumulation_steps 1 \
23 | --evaluation_strategy "no" \
24 | --save_strategy "steps" \
25 | --save_steps 50000 \
26 | --save_total_limit 1 \
27 |
--learning_rate 2e-4 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4\ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | 39 | -------------------------------------------------------------------------------- /finetune/scripts/llava/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /finetune/scripts/llava/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /finetune/scripts/llava/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 
| } -------------------------------------------------------------------------------- /finetune/scripts/mplug_owl2/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LOAD='MAGAer13/mplug-owl2-llama2-7b' 4 | 5 | DATA_FILE=../../tasks/WebQA_train_QA_owl.json 6 | deepspeed --include localhost:4,5,6,7 ../../../mplug_owl2/train/train_mem.py \ 7 | --lora_enable True --lora_r 128 --lora_alpha 256 --visual_abstractor_lr 2e-5 \ 8 | --deepspeed ./zero3.json \ 9 | --model_name_or_path $LOAD \ 10 | --version v1 \ 11 | --data_path $DATA_FILE \ 12 | --image_folder ../../tasks \ 13 | --image_aspect_ratio pad \ 14 | --group_by_modality_length True \ 15 | --bf16 True \ 16 | --output_dir ../../../checkpoints/mplug-owl2-2epoch-8batch_size-webqa-noise-injected-lora \ 17 | --num_train_epochs 2 \ 18 | --per_device_train_batch_size 8 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 10 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-4 \ 26 | --weight_decay 0. \ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --tune_visual_abstractor True \ 34 | --freeze_vision_model True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb -------------------------------------------------------------------------------- /finetune/scripts/mplug_owl2/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /finetune/scripts/mplug_owl2/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_param_persistence_threshold": "auto", 23 | "stage3_max_live_parameters": 0, 24 | "stage3_max_reuse_distance": 0, 25 | "stage3_prefetch_bucket_size": 0, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /finetune/scripts/mplug_owl2/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 
10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "none", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "stage3_gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": 
"auto", 44 | "steps_per_print": 100, 45 | "train_batch_size": "auto", 46 | "train_micro_batch_size_per_gpu": "auto", 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/finetune_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 14 | DATA="path_to_data" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --data_path $DATA \ 27 | --bf16 True \ 28 | --fix_vit True \ 29 | --output_dir output_qwen \ 30 | --num_train_epochs 5 \ 31 | --per_device_train_batch_size 1 \ 32 | --per_device_eval_batch_size 1 \ 33 | --gradient_accumulation_steps 16 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 1000 \ 37 | --save_total_limit 10 \ 38 | --learning_rate 1e-5 \ 39 | --weight_decay 0.1 \ 40 | --adam_beta2 0.95 \ 41 | --warmup_ratio 0.01 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --report_to "none" \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --deepspeed finetune/ds_config_zero3.json 49 | -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/finetune_lora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=7 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="../../tasks/WebQA_train_QA_qwenvl.json" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | 24 | export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 25 | 26 | torchrun $DISTRIBUTED_ARGS finetune.py \ 27 | --model_name_or_path $MODEL \ 28 | --data_path $DATA \ 29 | --bf16 True \ 30 | --fix_vit True \ 31 | --output_dir ../../../checkpoints/qwen-vl-chat-2epoch-2batch_size-webqa-noise-injected-lora-new \ 32 | --num_train_epochs 2 \ 33 | --per_device_train_batch_size 2 \ 34 | --per_device_eval_batch_size 1 \ 35 | --gradient_accumulation_steps 8 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 1000 \ 39 | --save_total_limit 10 \ 40 | --learning_rate 1e-5 \ 41 | --weight_decay 0.1 \ 42 | --adam_beta2 0.95 \ 43 | --warmup_ratio 0.01 \ 44 | --lr_scheduler_type "cosine" \ 45 | --logging_steps 1 \ 46 | --report_to "none" \ 47 | --model_max_length 2048 \ 48 | --lazy_preprocess True \ 49 | --use_lora \ 50 | --gradient_checkpointing \ 51 | --deepspeed ds_config_zero2.json -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/finetune_lora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | 6 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly 7 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 8 | # See the section for finetuning in README for more information. 9 | DATA="path_to_data" 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | python finetune.py \ 14 | --model_name_or_path $MODEL \ 15 | --data_path $DATA \ 16 | --bf16 True \ 17 | --fix_vit True \ 18 | --output_dir output_qwen \ 19 | --num_train_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --gradient_accumulation_steps 8 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 1000 \ 26 | --save_total_limit 10 \ 27 | --learning_rate 1e-5 \ 28 | --weight_decay 0.1 \ 29 | --adam_beta2 0.95 \ 30 | --warmup_ratio 0.01 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --report_to "none" \ 34 | --model_max_length 2048 \ 35 | --lazy_preprocess True \ 36 | --gradient_checkpointing \ 37 | --use_lora -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/finetune_qlora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="path_to_data" 15 | 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | # Remember to use --fp16 instead of --bf16 due to autogptq 26 | torchrun $DISTRIBUTED_ARGS finetune.py \ 27 | --model_name_or_path $MODEL \ 28 | --data_path $DATA \ 29 | --fp16 True \ 30 | --fix_vit True \ 31 | --output_dir output_qwen \ 32 | --num_train_epochs 5 \ 33 | --per_device_train_batch_size 2 \ 34 | --per_device_eval_batch_size 1 \ 35 | --gradient_accumulation_steps 8 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 1000 \ 39 | --save_total_limit 10 \ 40 | --learning_rate 1e-5 \ 41 | --weight_decay 0.1 \ 42 | --adam_beta2 0.95 \ 43 | --warmup_ratio 0.01 \ 44 | --lr_scheduler_type "cosine" \ 45 | --logging_steps 1 \ 46 | --report_to "none" \ 47 | --model_max_length 2048 \ 48 | --lazy_preprocess True \ 49 | --use_lora \ 50 | --q_lora \ 51 | --gradient_checkpointing \ 52 | --deepspeed finetune/ds_config_zero2.json -------------------------------------------------------------------------------- /finetune/scripts/qwenvl/finetune_qlora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly 6 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 7 | # See the section for finetuning in README for more information. 8 | DATA="path_to_data" 9 | 10 | export CUDA_VISIBLE_DEVICES=0 11 | 12 | # Remember to use --fp16 instead of --bf16 due to autogptq 13 | python finetune.py \ 14 | --model_name_or_path $MODEL \ 15 | --data_path $DATA \ 16 | --fp16 True \ 17 | --fix_vit True \ 18 | --output_dir output_qwen \ 19 | --num_train_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --gradient_accumulation_steps 8 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 1000 \ 26 | --save_total_limit 10 \ 27 | --learning_rate 1e-5 \ 28 | --weight_decay 0.1 \ 29 | --adam_beta2 0.95 \ 30 | --warmup_ratio 0.01 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --report_to "none" \ 34 | --model_max_length 2048 \ 35 | --lazy_preprocess True \ 36 | --gradient_checkpointing \ 37 | --use_lora \ 38 | --q_lora \ 39 | --deepspeed finetune/ds_config_zero2.json 40 | -------------------------------------------------------------------------------- /internvl_chat/README.md: -------------------------------------------------------------------------------- 1 | # InternVL-Chat 2 | 3 | This folder contains the implementation of the InternVL-Chat. 4 | 5 | ## 🛠️ Installation 6 | 7 | See [INSTALLATION.md](../INSTALLATION.md) 8 | 9 | In addition, using this codebase requires executing the following steps: 10 | 11 | - Install other requirements: 12 | 13 | ```bash 14 | pip install --upgrade pip # enable PEP 660 support 15 | pip install -e . 
16 | ``` 17 | 18 | ## 📖 Documents 19 | 20 | - InternVL 2.0 21 | 22 | - Introduction [\[link\]](https://internvl.readthedocs.io/en/latest/internvl2.0/introduction.html) 23 | - Quick Start [\[link\]](https://internvl.readthedocs.io/en/latest/internvl2.0/quick_start.html) 24 | - Finetune [\[link\]](https://internvl.readthedocs.io/en/latest/internvl2.0/finetune.html) 25 | - Evaluation [\[link\]](https://internvl.readthedocs.io/en/latest/internvl2.0/evaluation.html) 26 | - Deployment [\[link\]](https://internvl.readthedocs.io/en/latest/internvl2.0/deployment.html) 27 | 28 | - InternVL 1.5 29 | 30 | - Introduction [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.5/introduction.html) 31 | - Quick Start [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.5/quick_start.html) 32 | - Finetune [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.5/finetune.html) 33 | - Evaluation [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.5/evaluation.html) 34 | - Deployment [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.5/deployment.html) 35 | 36 | - InternVL 1.2 37 | 38 | - Introduction [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.2/introduction.html) 39 | - Quick Start [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.2/quick_start.html) 40 | - Reproduce [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.2/reproduce.html) 41 | - Finetune [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.2/finetune.html) 42 | - Evaluation [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.2/evaluation.html) 43 | 44 | - InternVL 1.1 45 | 46 | - Introduction [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.1/introduction.html) 47 | - Quick Start [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.1/quick_start.html) 48 | - Evaluation [\[link\]](https://internvl.readthedocs.io/en/latest/internvl1.1/evaluation.html) 49 | -------------------------------------------------------------------------------- /internvl_chat/internvl/dist_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | import subprocess 4 | from datetime import timedelta 5 | 6 | import deepspeed 7 | import torch 8 | import torch.multiprocessing as mp 9 | from torch import distributed as dist 10 | 11 | timeout = timedelta(minutes=60) 12 | 13 | 14 | def _find_free_port(): 15 | # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501 16 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 17 | # Binding to port 0 will cause the OS to find an available port for us 18 | sock.bind(('', 0)) 19 | port = sock.getsockname()[1] 20 | sock.close() 21 | # NOTE: there is still a chance the port could be taken by other processes. 
22 | return port 23 | 24 | 25 | def _is_free_port(port): 26 | ips = socket.gethostbyname_ex(socket.gethostname())[-1] 27 | ips.append('localhost') 28 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 29 | return all(s.connect_ex((ip, port)) != 0 for ip in ips) 30 | 31 | 32 | def init_dist(launcher, backend='nccl', **kwargs): 33 | if mp.get_start_method(allow_none=True) is None: 34 | mp.set_start_method('spawn') 35 | if launcher == 'pytorch': 36 | _init_dist_pytorch(backend, **kwargs) 37 | elif launcher == 'mpi': 38 | _init_dist_mpi(backend, **kwargs) 39 | elif launcher == 'slurm': 40 | _init_dist_slurm(backend, **kwargs) 41 | else: 42 | raise ValueError(f'Invalid launcher type: {launcher}') 43 | 44 | 45 | def _init_dist_pytorch(backend, **kwargs): 46 | # TODO: use local_rank instead of rank % num_gpus 47 | rank = int(os.environ['RANK']) 48 | num_gpus = torch.cuda.device_count() 49 | torch.cuda.set_device(rank % num_gpus) 50 | # dist.init_process_group(backend=backend, **kwargs) 51 | deepspeed.init_distributed(dist_backend=backend) 52 | 53 | 54 | def _init_dist_mpi(backend, **kwargs): 55 | local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) 56 | torch.cuda.set_device(local_rank) 57 | if 'MASTER_PORT' not in os.environ: 58 | # 29500 is torch.distributed default port 59 | os.environ['MASTER_PORT'] = '29500' 60 | if 'MASTER_ADDR' not in os.environ: 61 | raise KeyError('The environment variable MASTER_ADDR is not set') 62 | os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] 63 | os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] 64 | dist.init_process_group(backend=backend, **kwargs) 65 | 66 | 67 | def _init_dist_slurm(backend, port=None): 68 | """Initialize slurm distributed training environment. 69 | 70 | If argument ``port`` is not specified, then the master port will be system 71 | environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system 72 | environment variable, then a default port ``29500`` will be used. 73 | 74 | Args: 75 | backend (str): Backend of torch.distributed. 76 | port (int, optional): Master port. Defaults to None. 
77 | """ 78 | proc_id = int(os.environ['SLURM_PROCID']) 79 | ntasks = int(os.environ['SLURM_NTASKS']) 80 | node_list = os.environ['SLURM_NODELIST'] 81 | num_gpus = torch.cuda.device_count() 82 | torch.cuda.set_device(proc_id % num_gpus) 83 | addr = subprocess.getoutput( 84 | f'scontrol show hostname {node_list} | head -n1') 85 | # specify master port 86 | if port is not None: 87 | os.environ['MASTER_PORT'] = str(port) 88 | elif 'MASTER_PORT' in os.environ: 89 | pass # use MASTER_PORT in the environment variable 90 | else: 91 | # if torch.distributed default port(29500) is available 92 | # then use it, else find a free port 93 | if _is_free_port(29500): 94 | os.environ['MASTER_PORT'] = '29500' 95 | else: 96 | os.environ['MASTER_PORT'] = str(_find_free_port()) 97 | # use MASTER_ADDR in the environment variable if it already exists 98 | if 'MASTER_ADDR' not in os.environ: 99 | os.environ['MASTER_ADDR'] = addr 100 | os.environ['WORLD_SIZE'] = str(ntasks) 101 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 102 | os.environ['RANK'] = str(proc_id) 103 | # dist.init_process_group(backend=backend, timeout=timeout) 104 | deepspeed.init_distributed(dist_backend=backend) 105 | -------------------------------------------------------------------------------- /internvl_chat/internvl/model/__init__.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from internvl.model.internvl_chat import InternVLChatConfig, InternVLChatModel 5 | from transformers import AutoTokenizer 6 | 7 | 8 | def split_model(num_layers, vit_alpha=0.5): 9 | device_map = {} 10 | world_size = torch.cuda.device_count() 11 | # Since the first GPU will be used for ViT, treat it as half a GPU. 12 | num_layers_per_gpu = math.ceil(num_layers / (world_size - vit_alpha)) 13 | num_layers_per_gpu = [num_layers_per_gpu] * world_size 14 | num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * (1 - vit_alpha)) 15 | layer_cnt = 0 16 | for i, num_layer in enumerate(num_layers_per_gpu): 17 | for j in range(num_layer): 18 | device_map[f"language_model.model.layers.{layer_cnt}"] = i 19 | layer_cnt += 1 20 | device_map["vision_model"] = 0 21 | device_map["mlp1"] = 0 22 | device_map["language_model.model.tok_embeddings"] = 0 23 | device_map["language_model.model.embed_tokens"] = 0 24 | device_map["language_model.output"] = 0 25 | device_map["language_model.model.norm"] = 0 26 | device_map["language_model.lm_head"] = 0 27 | device_map[f"language_model.model.layers.{num_layers - 1}"] = 0 28 | 29 | return device_map 30 | 31 | 32 | def load_model_and_tokenizer(args): 33 | if args.auto: 34 | config = InternVLChatConfig.from_pretrained(args.checkpoint) 35 | num_hidden_layers = config.llm_config.num_hidden_layers 36 | device_map = split_model(num_hidden_layers) 37 | kwargs = {"device_map": device_map} if args.auto else {} 38 | tokenizer = AutoTokenizer.from_pretrained( 39 | args.checkpoint, trust_remote_code=True, use_fast=False 40 | ) 41 | model = InternVLChatModel.from_pretrained( 42 | args.checkpoint, 43 | low_cpu_mem_usage=True, 44 | torch_dtype=torch.bfloat16, 45 | load_in_8bit=args.load_in_8bit, 46 | load_in_4bit=args.load_in_4bit, 47 | **kwargs, 48 | ).eval() 49 | if not args.load_in_8bit and not args.load_in_4bit and not args.auto: 50 | model = model.cuda() 51 | return model, tokenizer 52 | -------------------------------------------------------------------------------- /internvl_chat/internvl/model/internvl_chat/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # InternVL 3 | # Copyright (c) 2024 OpenGVLab 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # -------------------------------------------------------- 6 | 7 | from .configuration_intern_vit import InternVisionConfig 8 | from .configuration_internvl_chat import InternVLChatConfig 9 | from .modeling_intern_vit import InternVisionModel 10 | from .modeling_internvl_chat import InternVLChatModel 11 | 12 | __all__ = ['InternVisionConfig', 'InternVisionModel', 13 | 'InternVLChatConfig', 'InternVLChatModel'] 14 | -------------------------------------------------------------------------------- /internvl_chat/internvl/model/internvl_chat/flash_attention.py: -------------------------------------------------------------------------------- 1 | # https://github.com/Dao-AILab/flash-attention/blob/v0.2.8/flash_attn/flash_attention.py 2 | import torch 3 | import torch.nn as nn 4 | from einops import rearrange 5 | 6 | try: # v1 7 | from flash_attn.flash_attn_interface import \ 8 | flash_attn_unpadded_qkvpacked_func 9 | except: # v2 10 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func 11 | 12 | from flash_attn.bert_padding import pad_input, unpad_input 13 | 14 | 15 | class FlashAttention(nn.Module): 16 | """Implement the scaled dot product attention with softmax. 17 | Arguments 18 | --------- 19 | softmax_scale: The temperature to use for the softmax attention. 20 | (default: 1/sqrt(d_keys) where d_keys is computed at 21 | runtime) 22 | attention_dropout: The dropout rate to apply to the attention 23 | (default: 0.0) 24 | """ 25 | 26 | def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None): 27 | super().__init__() 28 | self.softmax_scale = softmax_scale 29 | self.dropout_p = attention_dropout 30 | 31 | def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None, 32 | max_s=None, need_weights=False): 33 | """Implements the multihead softmax attention. 34 | Arguments 35 | --------- 36 | qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None 37 | if unpadded: (nnz, 3, h, d) 38 | key_padding_mask: a bool tensor of shape (B, S) 39 | """ 40 | assert not need_weights 41 | assert qkv.dtype in [torch.float16, torch.bfloat16] 42 | assert qkv.is_cuda 43 | 44 | if cu_seqlens is None: 45 | batch_size = qkv.shape[0] 46 | seqlen = qkv.shape[1] 47 | if key_padding_mask is None: 48 | qkv = rearrange(qkv, 'b s ... -> (b s) ...') 49 | max_s = seqlen 50 | cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, 51 | device=qkv.device) 52 | output = flash_attn_unpadded_qkvpacked_func( 53 | qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, 54 | softmax_scale=self.softmax_scale, causal=causal 55 | ) 56 | output = rearrange(output, '(b s) ... 
-> b s ...', b=batch_size) 57 | else: 58 | nheads = qkv.shape[-2] 59 | x = rearrange(qkv, 'b s three h d -> b s (three h d)') 60 | x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask) 61 | x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads) 62 | output_unpad = flash_attn_unpadded_qkvpacked_func( 63 | x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, 64 | softmax_scale=self.softmax_scale, causal=causal 65 | ) 66 | output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), 67 | indices, batch_size, seqlen), 68 | 'b s (h d) -> b s h d', h=nheads) 69 | else: 70 | assert max_s is not None 71 | output = flash_attn_unpadded_qkvpacked_func( 72 | qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, 73 | softmax_scale=self.softmax_scale, causal=causal 74 | ) 75 | 76 | return output, None 77 | -------------------------------------------------------------------------------- /internvl_chat/internvl/patch/__init__.py: -------------------------------------------------------------------------------- 1 | from .llama2_flash_attn_monkey_patch import replace_llama2_attn_with_flash_attn 2 | from .llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 3 | from .llama_rmsnorm_monkey_patch import \ 4 | replace_llama_rmsnorm_with_fused_rmsnorm 5 | from .pad_data_collator import concat_pad_data_collator, pad_data_collator 6 | from .train_sampler_patch import replace_train_sampler 7 | 8 | __all__ = ['replace_llama_attn_with_flash_attn', 9 | 'replace_llama_rmsnorm_with_fused_rmsnorm', 10 | 'replace_llama2_attn_with_flash_attn', 11 | 'replace_train_sampler', 12 | 'pad_data_collator', 13 | 'concat_pad_data_collator'] 14 | -------------------------------------------------------------------------------- /internvl_chat/internvl/patch/llama_rmsnorm_monkey_patch.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | 4 | def replace_llama_rmsnorm_with_fused_rmsnorm(): 5 | try: 6 | from functools import partial 7 | 8 | from apex.normalization import FusedRMSNorm 9 | LlamaRMSNorm = partial(FusedRMSNorm, eps=1e-6) # noqa 10 | transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm 11 | print('Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm') 12 | except ImportError: 13 | # using the normal LlamaRMSNorm 14 | pass 15 | except Exception: 16 | print('discovered apex but it failed to load, falling back to LlamaRMSNorm') 17 | pass 18 | -------------------------------------------------------------------------------- /internvl_chat/internvl/train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/internvl_chat/internvl/train/__init__.py -------------------------------------------------------------------------------- /internvl_chat/internvl/train/constants.py: -------------------------------------------------------------------------------- 1 | IMG_CONTEXT_TOKEN = '' 2 | IMG_START_TOKEN = '' 3 | IMG_END_TOKEN = '' 4 | QUAD_START_TOKEN = '' 5 | QUAD_END_TOKEN = '' 6 | REF_START_TOKEN = '' 7 | REF_END_TOKEN = '' 8 | BOX_START_TOKEN = '' 9 | BOX_END_TOKEN = '' 10 | IMAGENET_MEAN = (0.485, 0.456, 0.406) 11 | IMAGENET_STD = (0.229, 0.224, 0.225) 12 | CLIP_MEAN = (0.4814546, 0.4578275, 0.40821073) 13 | CLIP_STD = (0.2686295, 0.2613025, 0.2757711) 14 | SIGLIP_MEAN = (0.5, 0.5, 0.5) 15 | 
SIGLIP_STD = (0.5, 0.5, 0.5) 16 | -------------------------------------------------------------------------------- /internvl_chat/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "internvl_chat" 7 | version = "2.0.0" 8 | description = "Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | ] 17 | 18 | [project.urls] 19 | "Homepage" = "https://github.com/OpenGVLab/InternVL" 20 | "Bug Tracker" = "https://github.com/OpenGVLab/InternVL/issues" 21 | 22 | [tool.setuptools.packages.find] 23 | exclude = ["data*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "shell*"] 24 | 25 | [tool.wheel] 26 | exclude = ["data*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "shell*"] 27 | -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_flickr30k_rerank.json: -------------------------------------------------------------------------------- 1 | { 2 | "webqa_rerank": { 3 | "root": "RagVL/finetune/tasks", 4 | "annotation": "RagVL/finetune/tasks/Flickr30k_one_train_rerank_clip_negatives_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 58000 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_mmqa_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "webqa_qa": { 3 | "root": "RagVL/finetune/tasks", 4 | "annotation": "RagVL/finetune/tasks/MMQA_train_QA_single_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 2099 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_mmqa_rerank.json: -------------------------------------------------------------------------------- 1 | { 2 | "webqa_rerank": { 3 | "root": "RagVL/finetune/tasks", 4 | "annotation": "RagVL/finetune/tasks/MMQA_train_rerank_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 19432 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_mscoco_rerank.json: -------------------------------------------------------------------------------- 1 | { 2 | "webqa_rerank": { 3 | "root": "RagVL/finetune/tasks", 4 | "annotation": "RagVL/finetune/tasks/MSCOCO_one_train_rerank_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 40000 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_webqa_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "webqa_qa": { 3 | "root": "finetune/tasks", 4 | "annotation": "finetune/tasks/WebQA_train_QA_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 15163 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/data/internvl_2_finetune_webqa_rerank.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "webqa_rerank": { 3 | "root": "RagVL/finetune/tasks", 4 | "annotation": "RagVL/finetune/tasks/WebQA_train_rerank_internvl.jsonl", 5 | "data_augment": false, 6 | "repeat_time": 1, 7 | "length": 32990 8 | } 9 | } -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-16} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-128} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_2/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full' 20 | 21 | if [ ! -d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 16 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 2 28 | # total batch size: 128 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./pretrained/InternVL-Chat-V1-2-Plus" \ 41 | --conv_style "Hermes-2" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 1 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.0 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone True \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 1 \ 62 | --learning_rate 1e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 2048 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size False \ 72 | --use_thumbnail False \ 73 | --ps_version 'v1' \ 74 | --deepspeed "zero_stage3_config_34b.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | 
export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_2/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_lora' 15 | 16 | if [ ! -d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL-Chat-V1-2-Plus" \ 33 | --conv_style "Hermes-2" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 1 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 1e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 2048 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size False \ 65 | --use_thumbnail False \ 66 | --ps_version 'v1' \ 67 | --deepspeed "zero_stage3_config_34b.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.2/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-64} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-512} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-8} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_2/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 64 26 | # batch size per gpu: 8 27 | # gradient accumulation steps: 1 28 | # total batch size: 512 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --vision_path "./pretrained/InternViT-6B-448px-V1-2" \ 41 | --mlp_path "./pretrained/InternViT-6B-448px-V1-2/mlp_projector/hermes_2_yi_34b.pth" \ 42 | --llm_path "./pretrained/Nous-Hermes-2-Yi-34B" \ 43 | --conv_style "Hermes-2" \ 44 | --output_dir ${OUTPUT_DIR} \ 45 | --meta_path "./shell/data/internvl_1_2_finetune.json" \ 46 | --overwrite_output_dir True \ 47 | --force_image_size 448 \ 48 | --down_sample_ratio 0.5 \ 49 | --drop_path_rate 0.4 \ 50 | --freeze_llm False \ 51 | --freeze_mlp False \ 52 | --freeze_backbone False \ 53 | --vision_select_layer -1 \ 54 | --dataloader_num_workers 4 \ 55 | --bf16 True \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 58 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 59 | --evaluation_strategy "no" \ 60 | --save_strategy "steps" \ 61 | --save_steps 200 \ 62 | --save_total_limit 3 \ 63 | --learning_rate 1e-5 \ 64 | --weight_decay 0.05 \ 65 | --warmup_ratio 0.03 \ 66 | --lr_scheduler_type "cosine" \ 67 | --logging_steps 1 \ 68 | --max_seq_length 2048 \ 69 | --do_train True \ 70 | --grad_checkpoint True \ 71 | --group_by_length True \ 72 | --deepspeed "zero_stage3_config_34b.json" \ 73 | --report_to "tensorboard" \ 74 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 75 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/Mini-InternVL-Chat-2B-V1-5" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.01 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/Mini-InternVL-Chat-2B-V1-5" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 4e-5 \ 56 | --weight_decay 0.01 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage1_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL-Chat-V1-5" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.4 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 2e-5 \ 55 | --weight_decay 0.05 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage3_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL-Chat-V1-5" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 2e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage3_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/Mini-InternVL-Chat-4B-V1-5" \ 33 | --conv_style "phi3-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.05 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/Mini-InternVL-Chat-4B-V1-5" \ 33 | --conv_style "phi3-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 12 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 4e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage1_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-256} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-1024} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 256 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 1 28 | # total batch size: 1024 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./work_dirs/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain" \ 41 | --conv_style "Hermes-2" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "path/to/finetune/data.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 12 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.4 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone False \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 3 \ 62 | --learning_rate 2e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 4096 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage3_config_34b.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-256} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-2048} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 256 26 | # batch size per gpu: 2 27 | # gradient accumulation steps: 4 28 | # total batch size: 2048 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_pretrain.py \ 40 | --vision_path "./pretrained/InternViT-6B-448px-V1-5" \ 41 | --mlp_path "./pretrained/InternViT-6B-448px-V1-2/mlp_projector/hermes_2_yi_34b.pth" \ 42 | --llm_path "./pretrained/Nous-Hermes-2-Yi-34B" \ 43 | --conv_style "Hermes-2" \ 44 | --output_dir ${OUTPUT_DIR} \ 45 | --meta_path "path/to/pretrain/data.json" \ 46 | --overwrite_output_dir True \ 47 | --force_image_size 448 \ 48 | --max_dynamic_patch 12 \ 49 | --down_sample_ratio 0.5 \ 50 | --drop_path_rate 0.0 \ 51 | --freeze_llm True \ 52 | --freeze_mlp False \ 53 | --freeze_backbone True \ 54 | --vision_select_layer -1 \ 55 | --dataloader_num_workers 4 \ 56 | --bf16 True \ 57 | --num_train_epochs 1 \ 58 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 59 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 60 | --evaluation_strategy "no" \ 61 | --save_strategy "steps" \ 62 | --save_steps 200 \ 63 | --save_total_limit 3 \ 64 | --learning_rate 1e-4 \ 65 | --weight_decay 0.05 \ 66 | --warmup_steps 100 \ 67 | --lr_scheduler_type "cosine" \ 68 | --logging_steps 1 \ 69 | --max_seq_length 4096 \ 70 | --do_train True \ 71 | --grad_checkpoint True \ 72 | --group_by_length False \ 73 | --dynamic_image_size True \ 74 | --use_thumbnail True \ 75 | --ps_version 'v2' \ 76 | --deepspeed "zero_stage3_config_34b.json" \ 77 | --report_to "tensorboard" \ 78 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 79 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-128} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-1024} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 128 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 2 28 | # total batch size: 1024 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./work_dirs/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain" \ 41 | --conv_style "internlm2-chat" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "path/to/finetune/data.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 12 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.1 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone False \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 3 \ 62 | --learning_rate 4e-5 \ 63 | --weight_decay 0.01 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 8192 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage1_config.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-128} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-2048} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-8} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 128 26 | # batch size per gpu: 8 27 | # gradient accumulation steps: 2 28 | # total batch size: 2048 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_pretrain.py \ 40 | --vision_path "./pretrained/InternViT-300M-448px" \ 41 | --llm_path "./pretrained/internlm2-chat-1_8b" \ 42 | --conv_style "internlm2-chat" \ 43 | --output_dir ${OUTPUT_DIR} \ 44 | --meta_path "path/to/pretrain/data.json" \ 45 | --overwrite_output_dir True \ 46 | --force_image_size 448 \ 47 | --max_dynamic_patch 12 \ 48 | --down_sample_ratio 0.5 \ 49 | --drop_path_rate 0.1 \ 50 | --freeze_llm True \ 51 | --freeze_mlp False \ 52 | --freeze_backbone False \ 53 | --vision_select_layer -1 \ 54 | --dataloader_num_workers 4 \ 55 | --bf16 True \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 58 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 59 | --evaluation_strategy "no" \ 60 | --save_strategy "steps" \ 61 | --save_steps 200 \ 62 | --save_total_limit 3 \ 63 | --learning_rate 2e-5 \ 64 | --weight_decay 0.01 \ 65 | --warmup_steps 100 \ 66 | --lr_scheduler_type "cosine" \ 67 | --logging_steps 1 \ 68 | --max_seq_length 4096 \ 69 | --do_train True \ 70 | --grad_checkpoint True \ 71 | --group_by_length False \ 72 | --dynamic_image_size True \ 73 | --use_thumbnail True \ 74 | --ps_version 'v2' \ 75 | --deepspeed "zero_stage1_config.json" \ 76 | --report_to "tensorboard" \ 77 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 78 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/internlm2_20b/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-256} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-1024} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 256 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 1 28 | # total batch size: 1024 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./work_dirs/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain" \ 41 | --conv_style "internlm2-chat" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "path/to/finetune/data.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 12 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.4 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone False \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 3 \ 62 | --learning_rate 2e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 4096 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage3_config.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/internlm2_20b/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-256} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-2048} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 256 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 2 28 | # total batch size: 2048 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_pretrain.py \ 40 | --vision_path "./pretrained/InternViT-6B-448px-V1-5" \ 41 | --llm_path "./pretrained/internlm2-chat-20b" \ 42 | --conv_style "internlm2-chat" \ 43 | --output_dir ${OUTPUT_DIR} \ 44 | --meta_path "path/to/pretrain/data.json" \ 45 | --overwrite_output_dir True \ 46 | --force_image_size 448 \ 47 | --max_dynamic_patch 12 \ 48 | --down_sample_ratio 0.5 \ 49 | --drop_path_rate 0.2 \ 50 | --freeze_llm True \ 51 | --freeze_mlp False \ 52 | --freeze_backbone False \ 53 | --vision_select_layer -1 \ 54 | --dataloader_num_workers 4 \ 55 | --bf16 True \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 58 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 59 | --evaluation_strategy "no" \ 60 | --save_strategy "steps" \ 61 | --save_steps 200 \ 62 | --save_total_limit 3 \ 63 | --learning_rate 1e-5 \ 64 | --weight_decay 0.05 \ 65 | --warmup_steps 100 \ 66 | --lr_scheduler_type "cosine" \ 67 | --logging_steps 1 \ 68 | --max_seq_length 4096 \ 69 | --do_train True \ 70 | --grad_checkpoint True \ 71 | --group_by_length False \ 72 | --dynamic_image_size True \ 73 | --use_thumbnail True \ 74 | --ps_version 'v2' \ 75 | --deepspeed "zero_stage3_config.json" \ 76 | --report_to "tensorboard" \ 77 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 78 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/phi3_3_8b/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-128} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-1024} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 128 26 | # batch size per gpu: 4 27 | # gradient accumulation steps: 2 28 | # total batch size: 1024 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./work_dirs/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune" \ 41 | --conv_style "phi3-chat" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "path/to/finetune/data.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 12 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.1 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone False \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 3 \ 62 | --learning_rate 4e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 8192 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage1_config.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl1.5/phi3_3_8b/internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-128} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-2048} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-8} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 128 26 | # batch size per gpu: 8 27 | # gradient accumulation steps: 2 28 | # total batch size: 2048 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_pretrain.py \ 40 | --vision_path "./pretrained/InternViT-300M-448px" \ 41 | --llm_path "./pretrained/Phi-3-mini-128k-instruct" \ 42 | --conv_style "phi3-chat" \ 43 | --output_dir ${OUTPUT_DIR} \ 44 | --meta_path "path/to/pretrain/data.json" \ 45 | --overwrite_output_dir True \ 46 | --force_image_size 448 \ 47 | --max_dynamic_patch 12 \ 48 | --down_sample_ratio 0.5 \ 49 | --drop_path_rate 0.0 \ 50 | --freeze_llm True \ 51 | --freeze_mlp False \ 52 | --freeze_backbone True \ 53 | --vision_select_layer -1 \ 54 | --dataloader_num_workers 4 \ 55 | --bf16 True \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 58 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 59 | --evaluation_strategy "no" \ 60 | --save_strategy "steps" \ 61 | --save_steps 200 \ 62 | --save_total_limit 3 \ 63 | --learning_rate 2e-4 \ 64 | --weight_decay 0.05 \ 65 | --warmup_steps 100 \ 66 | --lr_scheduler_type "cosine" \ 67 | --logging_steps 1 \ 68 | --max_seq_length 4096 \ 69 | --do_train True \ 70 | --grad_checkpoint True \ 71 | --group_by_length False \ 72 | --dynamic_image_size True \ 73 | --use_thumbnail True \ 74 | --ps_version 'v2' \ 75 | --deepspeed "zero_stage1_config.json" \ 76 | --report_to "tensorboard" \ 77 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 78 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-1B" \ 33 | --conv_style "Hermes-2" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.01 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-4} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | export CUDA_VISIBLE_DEVICES=0,1,2,3 14 | 15 | OUTPUT_DIR='RagVL/checkpoints/internvl2_1b_1epoch-16batch_size-mmqa-noise-injected-lora' 16 | 17 | if [ ! 
-d "$OUTPUT_DIR" ]; then 18 | mkdir -p "$OUTPUT_DIR" 19 | fi 20 | 21 | # number of gpus: 4 22 | # batch size per gpu: 4 23 | # gradient accumulation steps: 1 24 | # total batch size: 16 25 | # epoch: 1 26 | torchrun \ 27 | --nnodes=1 \ 28 | --node_rank=0 \ 29 | --master_addr=127.0.0.1 \ 30 | --nproc_per_node=${GPUS} \ 31 | --master_port=${MASTER_PORT} \ 32 | RagVL/internvl_chat/internvl/train/internvl_chat_finetune.py \ 33 | --model_name_or_path "OpenGVLab/InternVL2-1B" \ 34 | --conv_style "Hermes-2" \ 35 | --output_dir ${OUTPUT_DIR} \ 36 | --meta_path "RagVL/internvl_chat/shell/data/internvl_2_finetune_mmqa_qa.json" \ 37 | --overwrite_output_dir True \ 38 | --force_image_size 448 \ 39 | --max_dynamic_patch 6 \ 40 | --down_sample_ratio 0.5 \ 41 | --drop_path_rate 0.0 \ 42 | --freeze_llm True \ 43 | --freeze_mlp True \ 44 | --freeze_backbone True \ 45 | --use_llm_lora 16 \ 46 | --vision_select_layer -1 \ 47 | --dataloader_num_workers 4 \ 48 | --bf16 True \ 49 | --num_train_epochs 1 \ 50 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 51 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 52 | --evaluation_strategy "no" \ 53 | --save_strategy "steps" \ 54 | --save_steps 200 \ 55 | --save_total_limit 1 \ 56 | --learning_rate 4e-5 \ 57 | --weight_decay 0.01 \ 58 | --warmup_ratio 0.03 \ 59 | --lr_scheduler_type "cosine" \ 60 | --logging_steps 1 \ 61 | --max_seq_length 4096 \ 62 | --do_train True \ 63 | --grad_checkpoint True \ 64 | --group_by_length True \ 65 | --dynamic_image_size True \ 66 | --use_thumbnail True \ 67 | --ps_version 'v2' \ 68 | --deepspeed "RagVL/internvl_chat/zero_stage1_config.json" \ 69 | --report_to "tensorboard" \ 70 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 71 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 2 22 | # gradient accumulation steps: 8 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-26B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.4 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 2e-5 \ 55 | --weight_decay 0.05 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage3_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_26b_internlm2_20b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 2 22 | # gradient accumulation steps: 4 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-26B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 2e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage3_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-2B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.01 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-4} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | export CUDA_VISIBLE_DEVICES=0,1,2,3 14 | 15 | OUTPUT_DIR='RagVL/checkpoints/internvl2_2b_1epoch-16batch_size-flickr30k-one-reranker-caption-clip-negatives-lora' 16 | 17 | if [ ! 
-d "$OUTPUT_DIR" ]; then 18 | mkdir -p "$OUTPUT_DIR" 19 | fi 20 | 21 | # number of gpus: 4 22 | # batch size per gpu: 4 23 | # gradient accumulation steps: 1 24 | # total batch size: 16 25 | # epoch: 1 26 | torchrun \ 27 | --nnodes=1 \ 28 | --node_rank=0 \ 29 | --master_addr=127.0.0.1 \ 30 | --nproc_per_node=${GPUS} \ 31 | --master_port=${MASTER_PORT} \ 32 | RagVL/internvl_chat/internvl/train/internvl_chat_finetune.py \ 33 | --model_name_or_path "OpenGVLab/InternVL2-2B" \ 34 | --conv_style "internlm2-chat" \ 35 | --output_dir ${OUTPUT_DIR} \ 36 | --meta_path "RagVL/internvl_chat/shell/data/internvl_2_finetune_flickr30k_rerank.json" \ 37 | --overwrite_output_dir True \ 38 | --force_image_size 448 \ 39 | --max_dynamic_patch 6 \ 40 | --down_sample_ratio 0.5 \ 41 | --drop_path_rate 0.0 \ 42 | --freeze_llm True \ 43 | --freeze_mlp True \ 44 | --freeze_backbone True \ 45 | --use_llm_lora 16 \ 46 | --vision_select_layer -1 \ 47 | --dataloader_num_workers 4 \ 48 | --bf16 True \ 49 | --num_train_epochs 1 \ 50 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 51 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 52 | --evaluation_strategy "no" \ 53 | --save_strategy "steps" \ 54 | --save_steps 200 \ 55 | --save_total_limit 1 \ 56 | --learning_rate 4e-5 \ 57 | --weight_decay 0.01 \ 58 | --warmup_ratio 0.03 \ 59 | --lr_scheduler_type "cosine" \ 60 | --logging_steps 1 \ 61 | --max_seq_length 4096 \ 62 | --do_train True \ 63 | --grad_checkpoint True \ 64 | --group_by_length True \ 65 | --dynamic_image_size True \ 66 | --use_thumbnail True \ 67 | --ps_version 'v2' \ 68 | --deepspeed "RagVL/internvl_chat/zero_stage1_config.json" \ 69 | --report_to "tensorboard" \ 70 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 71 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-512} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 16 23 | # total batch size: 512 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-2B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/coco_caption.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 128 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 4e-5 \ 56 | --weight_decay 0.01 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage1_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-16} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-128} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_full' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 16 26 | # batch size per gpu: 2 27 | # gradient accumulation steps: 4 28 | # total batch size: 128 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./pretrained/InternVL2-40B" \ 41 | --conv_style "Hermes-2" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 6 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.4 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone True \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 1 \ 62 | --learning_rate 2e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 4096 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage3_config_34b.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_40b_hermes2_yi_34b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 2 22 | # gradient accumulation steps: 4 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-40B" \ 33 | --conv_style "Hermes-2" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 2e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage3_config_34b.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-4B" \ 33 | --conv_style "phi3-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.05 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-4B" \ 33 | --conv_style "phi3-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 4e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage1_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | PARTITION=${PARTITION:-"INTERN2"} 4 | GPUS=${GPUS:-32} 5 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 6 | QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 7 | NODES=$((GPUS / GPUS_PER_NODE)) 8 | CPUS_PER_TASK=${CPUS_PER_TASK:-10} 9 | SRUN_ARGS=${SRUN_ARGS:-""} 10 | BATCH_SIZE=${BATCH_SIZE:-128} 11 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} 12 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 13 | 14 | 15 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 16 | export MASTER_PORT=34229 17 | export TF_CPP_MIN_LOG_LEVEL=3 18 | 19 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full' 20 | 21 | if [ ! 
-d "$OUTPUT_DIR" ]; then 22 | mkdir -p "$OUTPUT_DIR" 23 | fi 24 | 25 | # number of gpus: 32 26 | # batch size per gpu: 1 27 | # gradient accumulation steps: 4 28 | # total batch size: 128 29 | # epoch: 1 30 | srun -p ${PARTITION} \ 31 | --gres=gpu:${GPUS_PER_NODE} \ 32 | --nodes=${NODES} \ 33 | --ntasks=${GPUS} \ 34 | --ntasks-per-node=${GPUS_PER_NODE} \ 35 | --cpus-per-task=${CPUS_PER_TASK} \ 36 | --kill-on-bad-exit=1 \ 37 | --quotatype=${QUOTA_TYPE} \ 38 | ${SRUN_ARGS} \ 39 | python -u internvl/train/internvl_chat_finetune.py \ 40 | --model_name_or_path "./pretrained/InternVL2-Llama3-76B" \ 41 | --conv_style "internlm2-chat" \ 42 | --output_dir ${OUTPUT_DIR} \ 43 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 44 | --overwrite_output_dir True \ 45 | --force_image_size 448 \ 46 | --max_dynamic_patch 6 \ 47 | --down_sample_ratio 0.5 \ 48 | --drop_path_rate 0.4 \ 49 | --freeze_llm False \ 50 | --freeze_mlp False \ 51 | --freeze_backbone True \ 52 | --vision_select_layer -1 \ 53 | --dataloader_num_workers 4 \ 54 | --bf16 True \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 57 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 200 \ 61 | --save_total_limit 1 \ 62 | --learning_rate 2e-5 \ 63 | --weight_decay 0.05 \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --max_seq_length 4096 \ 68 | --do_train True \ 69 | --grad_checkpoint True \ 70 | --group_by_length True \ 71 | --dynamic_image_size True \ 72 | --use_thumbnail True \ 73 | --ps_version 'v2' \ 74 | --deepspeed "zero_stage3_config_100b.json" \ 75 | --report_to "tensorboard" \ 76 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 77 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 1 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-Llama3-76B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 2e-5 \ 56 | --weight_decay 0.05 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage3_config_100b.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-8} 4 | BATCH_SIZE=${BATCH_SIZE:-128} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 8 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 4 23 | # total batch size: 128 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-8B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.1 \ 41 | --freeze_llm False \ 42 | --freeze_mlp False \ 43 | --freeze_backbone True \ 44 | --vision_select_layer -1 \ 45 | --dataloader_num_workers 4 \ 46 | --bf16 True \ 47 | --num_train_epochs 1 \ 48 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 49 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "steps" \ 52 | --save_steps 200 \ 53 | --save_total_limit 1 \ 54 | --learning_rate 4e-5 \ 55 | --weight_decay 0.01 \ 56 | --warmup_ratio 0.03 \ 57 | --lr_scheduler_type "cosine" \ 58 | --logging_steps 1 \ 59 | --max_seq_length 4096 \ 60 | --do_train True \ 61 | --grad_checkpoint True \ 62 | --group_by_length True \ 63 | --dynamic_image_size True \ 64 | --use_thumbnail True \ 65 | --ps_version 'v2' \ 66 | --deepspeed "zero_stage1_config.json" \ 67 | --report_to "tensorboard" \ 68 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 69 | -------------------------------------------------------------------------------- /internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | GPUS=${GPUS:-2} 4 | BATCH_SIZE=${BATCH_SIZE:-16} 5 | PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} 6 | GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) 7 | 8 | 9 | export PYTHONPATH="${PYTHONPATH}:$(pwd)" 10 | export MASTER_PORT=34229 11 | export TF_CPP_MIN_LOG_LEVEL=3 12 | export LAUNCHER=pytorch 13 | 14 | OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora' 15 | 16 | if [ ! 
-d "$OUTPUT_DIR" ]; then 17 | mkdir -p "$OUTPUT_DIR" 18 | fi 19 | 20 | # number of gpus: 2 21 | # batch size per gpu: 4 22 | # gradient accumulation steps: 2 23 | # total batch size: 16 24 | # epoch: 1 25 | torchrun \ 26 | --nnodes=1 \ 27 | --node_rank=0 \ 28 | --master_addr=127.0.0.1 \ 29 | --nproc_per_node=${GPUS} \ 30 | --master_port=${MASTER_PORT} \ 31 | internvl/train/internvl_chat_finetune.py \ 32 | --model_name_or_path "./pretrained/InternVL2-8B" \ 33 | --conv_style "internlm2-chat" \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ 36 | --overwrite_output_dir True \ 37 | --force_image_size 448 \ 38 | --max_dynamic_patch 6 \ 39 | --down_sample_ratio 0.5 \ 40 | --drop_path_rate 0.0 \ 41 | --freeze_llm True \ 42 | --freeze_mlp True \ 43 | --freeze_backbone True \ 44 | --use_llm_lora 16 \ 45 | --vision_select_layer -1 \ 46 | --dataloader_num_workers 4 \ 47 | --bf16 True \ 48 | --num_train_epochs 1 \ 49 | --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ 50 | --gradient_accumulation_steps ${GRADIENT_ACC} \ 51 | --evaluation_strategy "no" \ 52 | --save_strategy "steps" \ 53 | --save_steps 200 \ 54 | --save_total_limit 1 \ 55 | --learning_rate 4e-5 \ 56 | --weight_decay 0.01 \ 57 | --warmup_ratio 0.03 \ 58 | --lr_scheduler_type "cosine" \ 59 | --logging_steps 1 \ 60 | --max_seq_length 4096 \ 61 | --do_train True \ 62 | --grad_checkpoint True \ 63 | --group_by_length True \ 64 | --dynamic_image_size True \ 65 | --use_thumbnail True \ 66 | --ps_version 'v2' \ 67 | --deepspeed "zero_stage1_config.json" \ 68 | --report_to "tensorboard" \ 69 | 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" 70 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage1_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 1, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e9, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e9, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | "lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": true 41 | } 42 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage2_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 2, 4 | "allgather_partitions": true, 5 | "allgather_bucket_size": 1e8, 6 | "overlap_comm": true, 7 | "reduce_scatter": true, 8 | "reduce_bucket_size": 1e8, 9 | "contiguous_gradients": true 10 | }, 11 | "fp16": { 12 | "enabled": "auto", 13 | "auto_cast": true, 14 | "loss_scale": 0, 15 | "initial_scale_power": 32, 16 | "loss_scale_window": 1000, 17 | "hysteresis": 2, 18 | "min_loss_scale": 1 19 | }, 20 | "bf16": { 21 | "enabled": "auto" 22 | }, 23 | "optimizer": { 24 | "type": "AdamW", 25 | "params": { 26 | 
"lr": "auto", 27 | "betas": [ 28 | 0.9, 29 | 0.999 30 | ], 31 | "eps": 1e-8, 32 | "weight_decay": "auto" 33 | } 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 2000, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } 42 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e7, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_100b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e4, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_34b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e5, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": 
true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /internvl_chat/zero_stage3_config_70b.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 3, 4 | "overlap_comm": true, 5 | "contiguous_gradients": true, 6 | "sub_group_size": 1e9, 7 | "reduce_bucket_size": 1e9, 8 | "stage3_prefetch_bucket_size": 1e9, 9 | "stage3_param_persistence_threshold": 1e5, 10 | "stage3_max_live_parameters": 1e9, 11 | "stage3_max_reuse_distance": 1e9, 12 | "stage3_gather_16bit_weights_on_model_save": true 13 | }, 14 | "fp16": { 15 | "enabled": "auto", 16 | "auto_cast": true, 17 | "loss_scale": 0, 18 | "initial_scale_power": 32, 19 | "loss_scale_window": 1000, 20 | "hysteresis": 2, 21 | "min_loss_scale": 1 22 | }, 23 | "bf16": { 24 | "enabled": "auto" 25 | }, 26 | "optimizer": { 27 | "type": "AdamW", 28 | "params": { 29 | "lr": "auto", 30 | "betas": [ 31 | 0.9, 32 | 0.999 33 | ], 34 | "eps": 1e-8, 35 | "weight_decay": "auto" 36 | } 37 | }, 38 | "gradient_accumulation_steps": "auto", 39 | "gradient_clipping": "auto", 40 | "steps_per_print": 2000, 41 | "train_batch_size": "auto", 42 | "train_micro_batch_size_per_gpu": "auto", 43 | "wall_clock_breakdown": true 44 | } 45 | -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model.language_model.llava_llama import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /llava/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in 
words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /llava/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '<image>' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /llava/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | 
parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 
69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /llava/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. 
* evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /llava/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | @torch.inference_mode() 14 | def eval_model(model_name, questions_file, answers_file): 15 | # Model 16 | disable_torch_init() 17 | model_name = os.path.expanduser(model_name) 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 19 | model = AutoModelForCausalLM.from_pretrained(model_name, 20 | torch_dtype=torch.float16).cuda() 21 | 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 41 | try: 42 | index = outputs.index(conv.sep, len(prompt)) 43 | except ValueError: 44 | outputs += conv.sep 45 | index = outputs.index(conv.sep, len(prompt)) 46 | 47 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 48 | ans_id = shortuuid.uuid() 49 | ans_file.write(json.dumps({"question_id": idx, 50 | "text": outputs, 51 | "answer_id": ans_id, 52 | "model_id": model_name, 53 | "metadata": {}}) + "\n") 54 | ans_file.flush() 55 | ans_file.close() 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 60 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 61 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 62 | args = parser.parse_args() 63 | 64 | eval_model(args.model_name, args.question_file, args.answers_file) 65 | -------------------------------------------------------------------------------- /llava/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def 
get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /llava/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is 
not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /llava/eval/webpage/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 3 | background-color: #f8f9fa; 4 | } 5 | 6 | .navbar-dark .navbar-nav .nav-link { 7 | color: #f1cf68; 8 | font-size: 1.1rem; 9 | padding: 0.5rem 0.6rem; 10 | } 11 | 12 | .card-header { 13 | font-weight: bold; 14 | } 15 | 16 | .card { 17 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 18 | transition: 0.3s; 19 | } 20 | 21 | .card:hover { 22 | box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); 23 | } 24 | 25 | button { 26 | transition: background-color 0.3s; 27 | } 28 | 29 | button:hover { 30 | background-color: #007bff; 31 | } 32 | 33 | @media (max-width: 767px) { 34 | .form-row .form-group { 35 | margin-bottom: 10px; 36 | } 37 | } 38 | 39 | /* Extra styles */ 40 | 41 | .expandable-card .card-text-container { 42 | max-height: 200px; 43 | overflow-y: hidden; 44 | position: relative; 45 | } 46 | 47 | .expandable-card.expanded .card-text-container { 48 | max-height: none; 49 | } 50 | 51 | .expand-btn { 52 | position: relative; 53 | display: none; 54 | background-color: rgba(255, 255, 255, 0.8); 55 | color: #510c75; 56 | border-color: transparent; 57 | } 58 | 59 | .expand-btn:hover { 60 | background-color: rgba(200, 200, 200, 0.8); 61 | text-decoration: none; 62 | border-color: transparent; 63 | color: #510c75; 64 | } 65 | 66 | .expand-btn:focus { 67 | outline: none; 68 | text-decoration: none; 69 | } 70 | 71 | .expandable-card:not(.expanded) .card-text-container:after { 72 | content: ""; 73 | position: absolute; 74 | bottom: 0; 75 | left: 0; 76 | width: 100%; 77 | height: 90px; 78 | background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); 79 | } 80 | 81 | .expandable-card:not(.expanded) .expand-btn { 82 | margin-top: -40px; 83 | } 84 | 85 | .card-body { 86 | padding-bottom: 5px; 87 | } 88 | 89 | .vertical-flex-layout { 90 | justify-content: center; 91 | align-items: center; 92 | height: 100%; 93 | display: flex; 94 | flex-direction: column; 95 | gap: 5px; 96 | } 97 | 98 | .figure-img { 99 | max-width: 100%; 100 | height: auto; 101 | } 102 | 103 | .adjustable-font-size { 104 | font-size: calc(0.5rem + 2vw); 105 | } 106 | -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | except: 6 | pass 7 | 
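Note on llava/model/__init__.py above: the bare "except: pass" swallows every import failure in the optional language-model backends, which can make missing-dependency problems hard to diagnose. A minimal sketch of a narrower guard, assuming only genuinely missing optional dependencies should be tolerated (an editorial sketch, not the repository's code):

import warnings

try:
    from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
    from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig
    from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig
except ImportError as exc:  # tolerate only missing optional backends; other errors still surface
    warnings.warn(f"Optional LLaVA language-model backend unavailable: {exc}")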
-------------------------------------------------------------------------------- /llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mpt.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, \ 21 | MptConfig, MptForCausalLM, MptModel 22 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 23 | 24 | 25 | class LlavaMptConfig(MptConfig): 26 | model_type = "llava_mpt" 27 | 28 | 29 | class LlavaMptModel(LlavaMetaModel, MptModel): 30 | config_class = LlavaMptConfig 31 | 32 | def __init__(self, config: MptConfig): 33 | config.hidden_size = config.d_model 34 | super(LlavaMptModel, self).__init__(config) 35 | 36 | def embed_tokens(self, x): 37 | return self.wte(x) 38 | 39 | 40 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 41 | config_class = LlavaMptConfig 42 | supports_gradient_checkpointing = True 43 | 44 | def __init__(self, config): 45 | super(MptForCausalLM, self).__init__(config) 46 | 47 | self.transformer = LlavaMptModel(config) 48 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.transformer 55 | 56 | def _set_gradient_checkpointing(self, module, value=False): 57 | if isinstance(module, LlavaMptModel): 58 | module.gradient_checkpointing = value 59 | 60 | def forward( 61 | self, 62 | input_ids: Optional[torch.LongTensor] = None, 63 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 64 | attention_mask: Optional[torch.Tensor] = None, 65 | inputs_embeds: Optional[torch.Tensor] = None, 66 | labels: Optional[torch.Tensor] = None, 67 | use_cache: Optional[bool] = None, 68 | output_attentions: Optional[bool] = None, 69 | output_hidden_states: Optional[bool] = None, 70 | return_dict: Optional[bool] = None, 71 | images=None): 72 | 73 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 74 | 75 | return super().forward( 76 | input_ids, 77 | past_key_values=past_key_values, 78 | attention_mask=attention_mask, 79 | inputs_embeds=inputs_embeds, 80 | labels=labels, 81 | use_cache=use_cache, 82 | output_attentions=output_attentions, 83 | output_hidden_states=output_hidden_states, 84 | return_dict=return_dict, 85 | ) 86 | 87 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 88 | images = kwargs.pop("images", None) 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 91 | ) 92 | _inputs['images'] = images 93 | return _inputs 94 | 95 | 96 | AutoConfig.register("llava_mpt", LlavaMptConfig) 97 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 98 | 
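Note: the AutoConfig.register and AutoModelForCausalLM.register calls at the end of llava_mpt.py hook the custom "llava_mpt" model type into the transformers Auto classes, so a checkpoint whose config.json declares that type resolves to LlavaMptForCausalLM without naming the class explicitly. A minimal sketch, assuming such a checkpoint folder exists locally (the path is a placeholder):

    import torch
    import llava.model  # importing the package runs the register() calls above
    from transformers import AutoConfig, AutoModelForCausalLM

    ckpt = "checkpoints/llava-mpt-7b"  # hypothetical folder; its config.json must say "model_type": "llava_mpt"
    cfg = AutoConfig.from_pretrained(ckpt)
    model = AutoModelForCausalLM.from_pretrained(ckpt, config=cfg, torch_dtype=torch.float16)
    print(type(model).__name__)  # LlavaMptForCausalLM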
-------------------------------------------------------------------------------- /llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- 
/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/serve/__init__.py -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 
38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | # train() 6 | -------------------------------------------------------------------------------- /llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /merge_lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from internvl_chat.internvl.model.internvl_chat import InternVLChatModel 3 | from transformers import AutoTokenizer 4 | 5 | input_path = ( 6 | "checkpoints/web/internvl2_2b_1epoch-16batch_size-webqa-reranker-caption-lora" 7 | ) 8 | output_path = ( 9 | "checkpoints/web/internvl2_2b_1epoch-16batch_size-webqa-reranker-caption-lora-merge" 10 | ) 11 | 12 | print("Loading model...") 13 | model = InternVLChatModel.from_pretrained( 14 | input_path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 15 | ).eval() 16 | print("Loading tokenizer...") 17 | tokenizer = AutoTokenizer.from_pretrained(input_path, trust_remote_code=True) 18 | 19 | if model.config.use_backbone_lora: 20 | model.vision_model.merge_and_unload() 21 | model.vision_model = model.vision_model.model 22 | model.config.use_backbone_lora = 0 23 | if model.config.use_llm_lora: 24 | model.language_model.merge_and_unload() 25 | model.language_model = model.language_model.model 26 | model.config.use_llm_lora = 0 27 | 28 | print("Saving model...") 29 | model.save_pretrained(output_path) 30 | print("Saving tokenizer...") 31 | tokenizer.save_pretrained(output_path) 32 | print("Done!") 33 | -------------------------------------------------------------------------------- /mmqa_oracle.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import ipdb 3 | import json 4 | from tqdm import tqdm 5 | import 
numpy as np 6 | 7 | from utils.metrics import mmqa_metrics_approx 8 | from utils.model_series import load_generator 9 | from utils.utils import infer 10 | import argparse 11 | 12 | 13 | ############### CLIP + Rerank ############### 14 | def baseline_generate( 15 | val_dataset, 16 | generator_path, 17 | tokenizer, 18 | image_processor, 19 | generator_model, 20 | ): 21 | acc_scores = {"ALL": []} 22 | 23 | with open("datasets/MMQA_ImageQ_metadata.json", "r") as f: 24 | metadata = json.load(f) 25 | 26 | for datum in tqdm(val_dataset): 27 | qid = datum["qid"] 28 | question = datum["question"] 29 | answer = datum["answers"][0]["answer"] 30 | pos_imgs = datum["supporting_context"] 31 | 32 | pos_source = [] 33 | 34 | for item in pos_imgs: 35 | pos_source.append(item["doc_id"]) 36 | 37 | IMAGE_PATH = "" 38 | for i in range(len(pos_source)): 39 | IMAGE_PATH += "finetune/tasks/MMQA_imgs/" + metadata[pos_source[i]]["path"] 40 | if i != len(pos_source) - 1: 41 | IMAGE_PATH += "," 42 | 43 | output = infer( 44 | generator_path, 45 | IMAGE_PATH, 46 | question, 47 | generator_model, 48 | tokenizer, 49 | image_processor, 50 | from_array=False, 51 | ) 52 | 53 | if "how many" in question.lower(): 54 | qcate = "number" 55 | else: 56 | qcate = "normal" 57 | 58 | accuracy = mmqa_metrics_approx(output, answer, qcate) 59 | acc_scores["ALL"].append(accuracy) 60 | 61 | print("Generation ACC:", np.mean(acc_scores["ALL"])) 62 | 63 | 64 | def main(): 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--datasets", type=str, default="test") 67 | parser.add_argument("--generator_model", type=str, default="noise_injected_lora") 68 | parser.add_argument("--series", type=str, default="llava") 69 | args = parser.parse_args() 70 | print(args) 71 | 72 | (tokenizer, generator_model, image_processor), generator_path = load_generator( 73 | args, "mmqa" 74 | ) 75 | 76 | if args.datasets == "test": 77 | with open("datasets/MMQA_test_image.json", "r") as f: 78 | val_dataset = json.load(f) 79 | 80 | elif args.datasets == "dev": 81 | with open("datasets/MMQA_test_image.json", "r") as f: 82 | val_dataset = json.load(f) 83 | 84 | with torch.no_grad(): 85 | baseline_generate( 86 | val_dataset, 87 | generator_path, 88 | tokenizer, 89 | image_processor, 90 | generator_model, 91 | ) 92 | 93 | print(args) 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /mplug_owl2/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import MPLUGOwl2LlamaForCausalLM -------------------------------------------------------------------------------- /mplug_owl2/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "./demo_logs" 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<|image|>" 10 | -------------------------------------------------------------------------------- /mplug_owl2/evaluate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/evaluate/__init__.py -------------------------------------------------------------------------------- /mplug_owl2/evaluate/mmbench_converter.py: -------------------------------------------------------------------------------- 1 | 
import pandas as pd 2 | import io 3 | import base64 4 | import json 5 | from PIL import Image 6 | 7 | ''' 8 | This scripts convert mmbench_dev tsv file to jsonl 9 | ''' 10 | 11 | datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 12 | 13 | global_choices = ['A', 'B', 'C', 'D'] 14 | 15 | def decode_base64_to_image(base64_string): 16 | image_data = base64.b64decode(base64_string) 17 | image = Image.open(io.BytesIO(image_data)) 18 | return image 19 | 20 | 21 | with open('./data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 'w') as f: 22 | for idx in range(len(datas)): 23 | data = datas.iloc[idx] 24 | 25 | index = int(data['index']) 26 | question = data['question'] 27 | hint = data['hint'] if not pd.isna(data['hint']) else 'N/A' 28 | 29 | choices = [] 30 | for opt in global_choices: 31 | if pd.isna(data[opt]): 32 | continue 33 | choices.append(data[opt]) 34 | 35 | answer = global_choices.index(data['answer']) 36 | 37 | image = decode_base64_to_image(data['image']) 38 | image.save("data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index) 39 | 40 | f.write(json.dumps({ 41 | "index": index, 42 | "image": "data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index, 43 | "hint": hint, 44 | "question": question, 45 | "choices": choices, 46 | "answer": answer, 47 | }) + "\n") -------------------------------------------------------------------------------- /mplug_owl2/local_serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/local_serve/__init__.py -------------------------------------------------------------------------------- /mplug_owl2/local_serve/examples/Rebecca_(1939_poster)_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/local_serve/examples/Rebecca_(1939_poster)_Small.jpeg -------------------------------------------------------------------------------- /mplug_owl2/local_serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/local_serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /mplug_owl2/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling_mplug_owl2 import MPLUGOwl2LlamaForCausalLM, MPLUGOwl2QWenForCausalLM 2 | from .configuration_mplug_owl2 import MPLUGOwl2Config, MPLUGOwl2QwenConfig 3 | -------------------------------------------------------------------------------- /mplug_owl2/model/configuration_qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 2 | # 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | from transformers import PretrainedConfig 7 | 8 | 9 | class QWenConfig(PretrainedConfig): 10 | model_type = "qwen" 11 | keys_to_ignore_at_inference = ["past_key_values"] 12 | 13 | def __init__( 14 | self, 15 | multiway=False, 16 | vocab_size=151936, 17 | hidden_size=4096, 18 | num_hidden_layers=32, 19 | num_attention_heads=32, 20 | emb_dropout_prob=0.0, 21 | attn_dropout_prob=0.0, 22 | layer_norm_epsilon=1e-6, 23 | initializer_range=0.02, 24 | max_position_embeddings=8192, 25 | scale_attn_weights=True, 26 | use_cache=True, 27 | bf16=False, 28 | fp16=False, 29 | fp32=False, 30 | kv_channels=128, 31 | rotary_pct=1.0, 32 | rotary_emb_base=10000, 33 | use_dynamic_ntk=True, 34 | use_logn_attn=True, 35 | use_flash_attn="auto", 36 | intermediate_size=22016, 37 | no_bias=True, 38 | tie_word_embeddings=False, 39 | use_cache_quantization=False, 40 | use_cache_kernel=False, 41 | softmax_in_fp32=False, 42 | **kwargs, 43 | ): 44 | self.multiway = multiway 45 | self.vocab_size = vocab_size 46 | self.hidden_size = hidden_size 47 | self.intermediate_size = intermediate_size 48 | self.num_hidden_layers = num_hidden_layers 49 | self.num_attention_heads = num_attention_heads 50 | self.emb_dropout_prob = emb_dropout_prob 51 | self.attn_dropout_prob = attn_dropout_prob 52 | self.layer_norm_epsilon = layer_norm_epsilon 53 | self.initializer_range = initializer_range 54 | self.scale_attn_weights = scale_attn_weights 55 | self.use_cache = use_cache 56 | self.max_position_embeddings = max_position_embeddings 57 | self.bf16 = bf16 58 | self.fp16 = fp16 59 | self.fp32 = fp32 60 | self.kv_channels = kv_channels 61 | self.rotary_pct = rotary_pct 62 | self.rotary_emb_base = rotary_emb_base 63 | self.use_dynamic_ntk = use_dynamic_ntk 64 | self.use_logn_attn = use_logn_attn 65 | self.use_flash_attn = use_flash_attn 66 | self.no_bias = no_bias 67 | self.use_cache_quantization = use_cache_quantization 68 | self.use_cache_kernel = use_cache_kernel 69 | self.softmax_in_fp32 = softmax_in_fp32 70 | super().__init__( 71 | tie_word_embeddings=tie_word_embeddings, 72 | **kwargs 73 | ) -------------------------------------------------------------------------------- /mplug_owl2/model/multiway.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.utils.checkpoint 4 | from torch import nn 5 | 6 | 7 | class MultiwayNetwork(nn.Module): 8 | 9 | def __init__(self, module_provider, num_multiway=2, out_features=None): 10 | super(MultiwayNetwork, self).__init__() 11 | 12 | self.multiway = torch.nn.ModuleList([module_provider() for _ in range(num_multiway)]) 13 | self.out_features=out_features 14 | def forward(self, hidden_states, multiway_indices): 15 | 16 | if len(self.multiway) == 1: 17 | return self.multiway[0](hidden_states) 18 | if self.out_features: 19 | output_hidden_states = torch.empty( 20 | hidden_states.size(0), hidden_states.size(1), self.out_features, 21 | dtype=hidden_states.dtype 22 | ).to(hidden_states.device) 23 | else: 24 | output_hidden_states = torch.empty_like(hidden_states) 25 | for idx, subway in enumerate(self.multiway): 26 | local_indices = multiway_indices.eq(idx).nonzero(as_tuple=True) 27 | hidden = hidden_states[local_indices].unsqueeze(1).contiguous() 28 | if hidden.numel(): 29 | output = subway(hidden) 30 | if isinstance(output, tuple): 31 | output = output[0] 32 | output = output.squeeze(1) 33 | output_hidden_states[local_indices] = output 34 | 35 | return output_hidden_states.contiguous() 
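Note: MultiwayNetwork keeps one expert module per modality and, for every position in the sequence, dispatches the hidden state to the expert selected by multiway_indices (in mPLUG-Owl2 this roughly separates text tokens from visual tokens). A minimal sketch of the routing with made-up sizes and a plain Linear expert:

    import torch
    from torch import nn
    from mplug_owl2.model.multiway import MultiwayNetwork

    hidden = torch.randn(2, 6, 16)                    # (batch, seq_len, hidden_size)
    indices = torch.tensor([[0, 0, 1, 1, 0, 0],
                            [1, 1, 1, 0, 0, 0]])      # 0 -> expert 0 (e.g. text), 1 -> expert 1 (e.g. visual)

    layer = MultiwayNetwork(module_provider=lambda: nn.Linear(16, 16), num_multiway=2)
    out = layer(hidden, indices)                      # routed per position, same shape as hidden
    print(out.shape)                                  # torch.Size([2, 6, 16])

Each expert only sees the positions assigned to it, and the results are scattered back into one output tensor, so the layer acts as a drop-in replacement for a single dense projection.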
-------------------------------------------------------------------------------- /mplug_owl2/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'mplug_owl2' in config and 'mplug_owl2' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer mPLUG-Owl2 code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "mplug_owl2") 15 | cfg.architectures[0] = 'MPLUGOwl2LlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) -------------------------------------------------------------------------------- /mplug_owl2/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/serve/__init__.py -------------------------------------------------------------------------------- /mplug_owl2/serve/examples/Rebecca_(1939_poster)_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/serve/examples/Rebecca_(1939_poster)_Small.jpeg -------------------------------------------------------------------------------- /mplug_owl2/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/mplug_owl2/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /mplug_owl2/serve/register_workers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 -------------------------------------------------------------------------------- /mplug_owl2/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.
4 | 5 | # Need to call this before importing transformers. 6 | from mplug_owl2.train.llama_flash_attn_monkey_patch import ( 7 | replace_llama_attn_with_flash_attn, 8 | ) 9 | 10 | replace_llama_attn_with_flash_attn() 11 | from mplug_owl2.train.train import train 12 | 13 | if __name__ == "__main__": 14 | train() 15 | -------------------------------------------------------------------------------- /qwenvl/run_qwenvl.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | from transformers.generation import GenerationConfig 3 | from peft import AutoPeftModelForCausalLM 4 | import torch 5 | 6 | 7 | def qwen_eval_relevance(image_path, question, model, tokenizer): 8 | 9 | query_list = [{"image": image_path}] 10 | 11 | query_list.append({"text": question}) 12 | 13 | query = tokenizer.from_list_format(query_list) 14 | outputs = model.chat( 15 | tokenizer, 16 | query=query, 17 | history=None, 18 | return_dict_in_generate=True, 19 | output_scores=True, 20 | do_sample=False, 21 | ) 22 | 23 | logits = outputs.scores[0][0] 24 | 25 | probs = ( 26 | torch.nn.functional.softmax( 27 | torch.FloatTensor( 28 | [ 29 | logits[tokenizer("Yes").input_ids[0]], 30 | logits[tokenizer("No").input_ids[0]], 31 | ] 32 | ), 33 | dim=0, 34 | ) 35 | .detach() 36 | .cpu() 37 | .numpy() 38 | ) 39 | 40 | return probs[0] 41 | 42 | 43 | def qwen_chat(image_path, question, model, tokenizer): 44 | 45 | query_list = [] 46 | if image_path: 47 | for img in image_path.split(","): 48 | query_list.append({"image": img}) 49 | 50 | query_list.append({"text": question}) 51 | 52 | query = tokenizer.from_list_format(query_list) 53 | response, _ = model.chat(tokenizer, query=query, history=None, do_sample=False) 54 | 55 | return response 56 | 57 | 58 | if __name__ == "__main__": 59 | model_path = "Qwen/Qwen-VL-Chat" 60 | adapter_path = ( 61 | "../checkpoints/qwen-vl-chat-2epoch-4batch_size-webqa-reranker-caption-lora" 62 | ) 63 | 64 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 65 | 66 | mm_model = AutoPeftModelForCausalLM.from_pretrained( 67 | adapter_path, # path to the output directory 68 | device_map="auto", 69 | trust_remote_code=True, 70 | ).eval() 71 | 72 | image_path = "../assets/framework.png" 73 | query = "Image Caption: Centennial Olympic Park splash fountain\nQuestion:\"Are there more than 6 tall lamp posts surrounding the fountain at Centennial Park?\"\nBased on the image and its caption, is the image relevant to the question? Answer 'Yes' or 'No'." 
74 | # ans = qwen_chat(image_path, query, mm_model, tokenizer) 75 | ans = qwen_eval_relevance(image_path, query, mm_model, tokenizer) 76 | print(ans) 77 | -------------------------------------------------------------------------------- /utils/FlagEmbedding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/utils/FlagEmbedding/__init__.py -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import Visualized_BGE -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_eva_vision_and_transforms 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg,\ 6 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 7 | from .openai import load_openai_model, list_openai_models 8 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\ 9 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 10 | from .tokenizer import SimpleTokenizer, tokenize 11 | from .transform import image_transform -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/utils/FlagEmbedding/visual/eva_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 
| "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | "bert": { 46 | "config_names": { 47 | "context_length": "max_position_embeddings", 48 | "vocab_size": "vocab_size", 49 | "width": "hidden_size", 50 | "heads": "num_attention_heads", 51 | "layers": "num_hidden_layers", 52 | "layer_attr": "layer", 53 | "token_embeddings_attr": "embeddings" 54 | }, 55 | "pooler": "mean_pooler", 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | 
"xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true, 19 | "patch_dropout": 0.5 20 | }, 21 | "text_cfg": { 22 | "context_length": 77, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "xattn": true, 28 | "fusedLN": true 29 | } 30 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 
-------------------------------------------------------------------------------- /utils/FlagEmbedding/visual/eva_clip/transform.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms.functional as F 6 | 7 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ 8 | CenterCrop 9 | 10 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 11 | 12 | 13 | class ResizeMaxSize(nn.Module): 14 | 15 | def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0): 16 | super().__init__() 17 | if not isinstance(max_size, int): 18 | raise TypeError(f"Size should be int. Got {type(max_size)}") 19 | self.max_size = max_size 20 | self.interpolation = interpolation 21 | self.fn = min if fn == 'min' else min 22 | self.fill = fill 23 | 24 | def forward(self, img): 25 | if isinstance(img, torch.Tensor): 26 | height, width = img.shape[:2] 27 | else: 28 | width, height = img.size 29 | scale = self.max_size / float(max(height, width)) 30 | if scale != 1.0: 31 | new_size = tuple(round(dim * scale) for dim in (height, width)) 32 | img = F.resize(img, new_size, self.interpolation) 33 | pad_h = self.max_size - new_size[0] 34 | pad_w = self.max_size - new_size[1] 35 | img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill) 36 | return img 37 | 38 | 39 | def _convert_to_rgb(image): 40 | return image.convert('RGB') 41 | 42 | 43 | # class CatGen(nn.Module): 44 | # def __init__(self, num=4): 45 | # self.num = num 46 | # def mixgen_batch(image, text): 47 | # batch_size = image.shape[0] 48 | # index = np.random.permutation(batch_size) 49 | 50 | # cat_images = [] 51 | # for i in range(batch_size): 52 | # # image mixup 53 | # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:] 54 | # # text concat 55 | # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0] 56 | # text = torch.stack(text) 57 | # return image, text 58 | 59 | 60 | def image_transform( 61 | image_size: int, 62 | is_train: bool, 63 | mean: Optional[Tuple[float, ...]] = None, 64 | std: Optional[Tuple[float, ...]] = None, 65 | resize_longest_max: bool = False, 66 | fill_color: int = 0, 67 | ): 68 | mean = mean or OPENAI_DATASET_MEAN 69 | if not isinstance(mean, (list, tuple)): 70 | mean = (mean,) * 3 71 | 72 | std = std or OPENAI_DATASET_STD 73 | if not isinstance(std, (list, tuple)): 74 | std = (std,) * 3 75 | 76 | if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: 77 | # for square size, pass size as int so that Resize() uses aspect preserving shortest edge 78 | image_size = image_size[0] 79 | 80 | normalize = Normalize(mean=mean, std=std) 81 | if is_train: 82 | return Compose([ 83 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), 84 | _convert_to_rgb, 85 | ToTensor(), 86 | normalize, 87 | ]) 88 | else: 89 | if resize_longest_max: 90 | transforms = [ 91 | ResizeMaxSize(image_size, fill=fill_color) 92 | ] 93 | else: 94 | transforms = [ 95 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 96 | CenterCrop(image_size), 97 | ] 98 | transforms.extend([ 99 | _convert_to_rgb, 100 | ToTensor(), 101 | normalize, 102 | ]) 103 | return Compose(transforms) 104 | -------------------------------------------------------------------------------- /utils/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEA-FinAI/RagVL/d31de677fa6ba391e0a47f5d7f98532fbdc80d5a/utils/__init__.py -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | from llava.mm_utils import get_model_name_from_path 2 | from llava.eval.run_llava import llava_chat, llava_eval_relevance 3 | from mplug_owl2.evaluate.run_mplug_owl2 import owl_chat, owl_eval_relevance 4 | from qwenvl.run_qwenvl import qwen_chat, qwen_eval_relevance 5 | from internvl_chat.eval.run_internvl import ( 6 | internvl_chat, 7 | internvl_eval_relevance, 8 | ) 9 | 10 | 11 | def cal_relevance(model_path, image_path, question, model, tokenizer, image_processor): 12 | 13 | if "qwen-vl" in model_path.lower(): 14 | prob = qwen_eval_relevance(image_path, question, model, tokenizer) 15 | else: 16 | args = type( 17 | "Args", 18 | (), 19 | { 20 | "model_path": model_path, 21 | "model_base": None, 22 | "model_name": get_model_name_from_path(model_path), 23 | "query": question, 24 | "conv_mode": None, 25 | "image_file": image_path, 26 | "sep": ",", 27 | "temperature": 0, 28 | "top_p": None, 29 | "num_beams": 1, 30 | "max_new_tokens": 512, 31 | }, 32 | )() 33 | 34 | if "llava" in model_path: 35 | prob = llava_eval_relevance(args, tokenizer, model, image_processor) 36 | elif "mplug-owl2" in model_path: 37 | prob = owl_eval_relevance(args, tokenizer, model, image_processor) 38 | elif "internvl" in model_path.lower(): 39 | prob = internvl_eval_relevance(args, tokenizer, model) 40 | 41 | return prob 42 | 43 | 44 | def infer( 45 | model_path, 46 | image_file, 47 | question, 48 | model, 49 | tokenizer, 50 | image_processor, 51 | from_array=False, 52 | ): 53 | if "webqa" in model_path: 54 | prompt_template = question 55 | else: 56 | prompt_template = ( 57 | f"{question}\nAnswer the question using a single word or phrase." 
58 | ) 59 | 60 | if "qwen-vl" in model_path.lower(): 61 | output = qwen_chat(image_file, prompt_template, model, tokenizer) 62 | else: 63 | args = type( 64 | "Args", 65 | (), 66 | { 67 | "model_path": model_path, 68 | "model_base": None, 69 | "model_name": get_model_name_from_path(model_path), 70 | "query": prompt_template, 71 | "conv_mode": None, 72 | "image_file": image_file, 73 | "sep": ",", 74 | "temperature": 0, 75 | "top_p": None, 76 | "num_beams": 1, 77 | "max_new_tokens": 512, 78 | }, 79 | )() 80 | 81 | if "llava" in model_path: 82 | output = llava_chat( 83 | args, 84 | tokenizer, 85 | model, 86 | image_processor, 87 | from_array=from_array, 88 | ) 89 | elif "mplug-owl2" in model_path: 90 | output = owl_chat(args, tokenizer, model, image_processor) 91 | elif "internvl" in model_path.lower(): 92 | output = internvl_chat(args, tokenizer, model) 93 | 94 | return output 95 | -------------------------------------------------------------------------------- /vcd_utils/vcd_add_noise.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def add_diffusion_noise(image_tensor, noise_step): 5 | num_steps = 1000 # Number of diffusion steps 6 | 7 | # decide beta in each step 8 | betas = torch.linspace(-6, 6, num_steps) 9 | betas = torch.sigmoid(betas) * (0.5e-2 - 1e-5) + 1e-5 10 | 11 | # decide alphas in each step 12 | alphas = 1 - betas 13 | alphas_prod = torch.cumprod(alphas, dim=0) 14 | alphas_prod_p = torch.cat( 15 | [torch.tensor([1]).float(), alphas_prod[:-1]], 0 16 | ) # p for previous 17 | alphas_bar_sqrt = torch.sqrt(alphas_prod) 18 | one_minus_alphas_bar_log = torch.log(1 - alphas_prod) 19 | one_minus_alphas_bar_sqrt = torch.sqrt(1 - alphas_prod) 20 | 21 | def q_x(x_0, t): 22 | noise = torch.randn_like(x_0) 23 | alphas_t = alphas_bar_sqrt[t] 24 | alphas_1_m_t = one_minus_alphas_bar_sqrt[t] 25 | return alphas_t * x_0 + alphas_1_m_t * noise 26 | 27 | noise_delta = int(noise_step) # from 0-999 28 | noisy_image = image_tensor.clone() 29 | image_tensor_cd = q_x(noisy_image, noise_step) 30 | 31 | return image_tensor_cd 32 | --------------------------------------------------------------------------------
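Note: add_diffusion_noise follows the standard DDPM forward process. It builds a 1000-step sigmoid beta schedule, forms the cumulative products alphas_prod, and returns sqrt(alphas_prod[t]) * x_0 + sqrt(1 - alphas_prod[t]) * noise, so a larger noise_step hands back an increasingly corrupted copy of the input image tensor (the distorted visual input used for VCD-style contrastive decoding). A minimal usage sketch on a random tensor, with the step values chosen arbitrarily:

    import torch
    from vcd_utils.vcd_add_noise import add_diffusion_noise

    image_tensor = torch.randn(3, 336, 336)   # stand-in for a preprocessed image tensor
    for t in (200, 500, 999):
        noisy = add_diffusion_noise(image_tensor, noise_step=t)
        drift = torch.nn.functional.mse_loss(noisy, image_tensor).item()
        print(t, round(drift, 3))             # the distortion grows with the step index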