├── llava
│   ├── serve
│   │   ├── __init__.py
│   │   ├── examples
│   │   │   ├── waterview.jpg
│   │   │   └── extreme_ironing.jpg
│   │   ├── register_worker.py
│   │   └── test_message.py
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── utils.cpython-310.pyc
│   │   ├── __init__.cpython-310.pyc
│   │   ├── constants.cpython-310.pyc
│   │   ├── mm_utils.cpython-310.pyc
│   │   └── conversation.cpython-310.pyc
│   ├── train
│   │   ├── __pycache__
│   │   │   ├── train.cpython-310.pyc
│   │   │   ├── llava_trainer.cpython-310.pyc
│   │   │   └── rag_handler.cpython-310.pyc
│   │   ├── train_mem.py
│   │   └── llava_trainer_eval.py
│   ├── model
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-310.pyc
│   │   │   └── llava_arch.cpython-310.pyc
│   │   ├── language_model
│   │   │   ├── __pycache__
│   │   │   │   ├── llava_qwen.cpython-310.pyc
│   │   │   │   ├── llava_llama.cpython-310.pyc
│   │   │   │   ├── llava_mistral.cpython-310.pyc
│   │   │   │   └── llava_mixtral.cpython-310.pyc
│   │   │   └── llava_mpt.py
│   │   ├── multimodal_encoder
│   │   │   ├── __pycache__
│   │   │   │   ├── builder.cpython-310.pyc
│   │   │   │   ├── hf_vision.cpython-310.pyc
│   │   │   │   ├── imagebind.cpython-310.pyc
│   │   │   │   ├── bmclip_encoder.cpython-310.pyc
│   │   │   │   ├── clip_encoder.cpython-310.pyc
│   │   │   │   ├── siglip_encoder.cpython-310.pyc
│   │   │   │   └── open_clip_encoder.cpython-310.pyc
│   │   │   ├── dev_eva_clip
│   │   │   │   └── eva_clip
│   │   │   │       ├── constants.py
│   │   │   │       ├── bpe_simple_vocab_16e6.txt.gz
│   │   │   │       ├── model_configs
│   │   │   │       │   ├── EVA01-CLIP-B-16.json
│   │   │   │       │   ├── EVA01-CLIP-g-14.json
│   │   │   │       │   ├── EVA01-CLIP-g-14-plus.json
│   │   │   │       │   ├── EVA02-CLIP-bigE-14.json
│   │   │   │       │   ├── Internal-EVA02-CLIP-10B-14.json
│   │   │   │       │   ├── EVA02-CLIP-bigE-14-plus.json
│   │   │   │       │   ├── Internal-EVA02-CLIP-10B-14-448.json
│   │   │   │       │   ├── EVA-CLIP-18B.json
│   │   │   │       │   ├── EVA-CLIP-8B.json
│   │   │   │       │   ├── EVA-CLIP-8B-plus.json
│   │   │   │       │   ├── EVA02-CLIP-L-14.json
│   │   │   │       │   ├── EVA02-CLIP-B-16.json
│   │   │   │       │   └── EVA02-CLIP-L-14-336.json
│   │   │   │       ├── __init__.py
│   │   │   │       ├── hf_configs.py
│   │   │   │       └── transform.py
│   │   │   ├── eva_clip
│   │   │   │   ├── model_configs
│   │   │   │   │   ├── EVA01-CLIP-B-16.json
│   │   │   │   │   ├── EVA01-CLIP-g-14.json
│   │   │   │   │   ├── EVA01-CLIP-g-14-plus.json
│   │   │   │   │   ├── EVA02-CLIP-bigE-14.json
│   │   │   │   │   ├── Internal-EVA02-CLIP-10B-14.json
│   │   │   │   │   ├── EVA02-CLIP-bigE-14-plus.json
│   │   │   │   │   ├── Internal-EVA02-CLIP-10B-14-448.json
│   │   │   │   │   ├── EVA-CLIP-8B.json
│   │   │   │   │   ├── EVA-CLIP-18B.json
│   │   │   │   │   ├── EVA-CLIP-8B-plus.json
│   │   │   │   │   ├── EVA02-CLIP-B-16.json
│   │   │   │   │   ├── EVA02-CLIP-L-14.json
│   │   │   │   │   └── EVA02-CLIP-L-14-336.json
│   │   │   │   ├── factory.py
│   │   │   │   ├── eva_clip_processors.py
│   │   │   │   └── eva_clip_encoder.py
│   │   │   ├── builder.py
│   │   │   └── imagebind.py
│   │   ├── multimodal_projector
│   │   │   ├── __pycache__
│   │   │   │   ├── builder.cpython-310.pyc
│   │   │   │   └── pooler_projector.cpython-310.pyc
│   │   │   ├── pooler_projector.py
│   │   │   └── builder.py
│   │   ├── multimodal_resampler
│   │   │   ├── __pycache__
│   │   │   │   ├── builder.cpython-310.pyc
│   │   │   │   ├── qformer.cpython-310.pyc
│   │   │   │   ├── perceiver.cpython-310.pyc
│   │   │   │   ├── masked_drop.cpython-310.pyc
│   │   │   │   └── spatial_pool.cpython-310.pyc
│   │   │   ├── builder.py
│   │   │   ├── spatial_pool.py
│   │   │   └── masked_drop.py
│   │   ├── __init__.py
│   │   ├── utils.py
│   │   ├── consolidate.py
│   │   ├── apply_delta.py
│   │   └── make_delta.py
│   └── constants.py
├── images
│   ├── vis.png
│   ├── teasor.png
│   └── architecture.png
├── scripts
│   ├── archived
│   │   ├── interleave
│   │   │   ├── eval_all.sh
│   │   │   ├── eval_interleave_3d.sh
│   │   │   └── eval_multiprocess.sh
│   │   ├── archived_prev
│   │   │   ├── convert_mmvet_for_eval.py
│   │   │   ├── convert_gqa_for_eval.py
│   │   │   ├── sqa_eval_batch.sh
│   │   │   ├── sqa_eval_gather.sh
│   │   │   ├── merge_lora_weights.py
│   │   │   ├── finetune_sqa.sh
│   │   │   ├── entry_cmd.sh
│   │   │   ├── pretrain.sh
│   │   │   ├── convert_vizwiz_for_submission.py
│   │   │   ├── finetune_full_schedule.sh
│   │   │   ├── finetune_lora.sh
│   │   │   ├── finetune_qlora.sh
│   │   │   ├── convert_vqav2_for_submission.py
│   │   │   ├── quick_check.py
│   │   │   ├── finetune_mixtral_1.6_336px_anyres_lmms_eval.sh
│   │   │   ├── dpo_data_info.py
│   │   │   ├── finetune_mixtral_1.5.sh
│   │   │   ├── finetune.sh
│   │   │   ├── finetune_mixtral.sh
│   │   │   ├── finetune_mixtral_copy.sh
│   │   │   ├── convert_sqa_to_llava.py
│   │   │   ├── finetune_1.5.sh
│   │   │   ├── finetune_mixtral_1.6_336px_anyres_freeze_vision.sh
│   │   │   └── finetune_mixtral_1.6_336px_anyres.sh
│   │   ├── video
│   │   │   ├── eval
│   │   │   │   ├── video_detail_description_eval_only.sh
│   │   │   │   ├── activitynet_eval.sh
│   │   │   │   ├── video_detail_description_eval_shard.sh
│   │   │   │   └── video_description_from_t2v.sh
│   │   │   ├── demo
│   │   │   │   └── video_demo.sh
│   │   │   └── train
│   │   │       ├── SO400M_Qwen2_72B_ov_to_video_am9.sh
│   │   │       └── SO400M_Qwen2_7B_ov_to_video_am9.sh
│   │   ├── qwen.py
│   │   ├── train
│   │   │   ├── mid_stage.yaml
│   │   │   ├── dpo_ov7b.sh
│   │   │   ├── direct_finetune_clip.sh
│   │   │   ├── direct_finetune_siglip_a4.sh
│   │   │   ├── dpo.sh
│   │   │   ├── pt_clip2.sh
│   │   │   ├── finetune_si.sh
│   │   │   ├── finetune_ov.sh
│   │   │   └── finetune_siglip.sh
│   │   └── summarize_data.py
│   ├── zero2_offload.json
│   ├── zero2.json
│   ├── zero2_fused_adamw.json
│   ├── zero3.json
│   ├── zero3_offload.json
│   ├── zero3pp.json
│   └── mira_train
│       ├── pretrain_clip.sh
│       ├── pretrain_biomedclip.sh
│       ├── pretrain_siglip.sh
│       ├── sft_clip.sh
│       ├── sft_siglip.sh
│       └── sft_biomedclip.sh
├── data-factory
│   ├── data-merge
│   │   ├── merge_data.py
│   │   └── sample_images.py
│   ├── rag-crawler
│   │   └── preprocess_list.py
│   └── rag-maker
│       └── rewritor.py
└── README.md

/llava/serve/__init__.py:
----------------------------------------

----------------------------------------
/llava/__init__.py:
----------------------------------------
from .model import LlavaLlamaForCausalLM
----------------------------------------
/images/vis.png:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/images/vis.png
----------------------------------------
/images/teasor.png:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/images/teasor.png
----------------------------------------
/images/architecture.png:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/images/architecture.png
----------------------------------------
/llava/serve/examples/waterview.jpg:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/serve/examples/waterview.jpg
----------------------------------------
/llava/__pycache__/utils.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/__pycache__/utils.cpython-310.pyc
----------------------------------------
/llava/serve/examples/extreme_ironing.jpg:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/serve/examples/extreme_ironing.jpg
----------------------------------------
/llava/__pycache__/__init__.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/__pycache__/__init__.cpython-310.pyc
----------------------------------------
/llava/__pycache__/constants.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/__pycache__/constants.cpython-310.pyc
----------------------------------------
/llava/__pycache__/mm_utils.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/__pycache__/mm_utils.cpython-310.pyc
----------------------------------------
/llava/train/__pycache__/train.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/train/__pycache__/train.cpython-310.pyc
----------------------------------------
/llava/__pycache__/conversation.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/__pycache__/conversation.cpython-310.pyc
----------------------------------------
/llava/model/__pycache__/__init__.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/__pycache__/__init__.cpython-310.pyc
----------------------------------------
/llava/model/__pycache__/llava_arch.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/__pycache__/llava_arch.cpython-310.pyc
----------------------------------------
/llava/train/__pycache__/llava_trainer.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/train/__pycache__/llava_trainer.cpython-310.pyc
----------------------------------------
/llava/train/__pycache__/rag_handler.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/train/__pycache__/rag_handler.cpython-310.pyc
----------------------------------------
/llava/model/language_model/__pycache__/llava_qwen.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/language_model/__pycache__/llava_qwen.cpython-310.pyc
----------------------------------------
/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc
----------------------------------------
/llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc
----------------------------------------
/llava/model/language_model/__pycache__/llava_mixtral.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/language_model/__pycache__/llava_mixtral.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_encoder/__pycache__/hf_vision.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_encoder/__pycache__/hf_vision.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_encoder/__pycache__/imagebind.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_encoder/__pycache__/imagebind.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_resampler/__pycache__/builder.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_resampler/__pycache__/builder.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_resampler/__pycache__/qformer.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_resampler/__pycache__/qformer.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/constants.py:
----------------------------------------
OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
----------------------------------------
/llava/model/multimodal_resampler/__pycache__/perceiver.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_resampler/__pycache__/perceiver.cpython-310.pyc
----------------------------------------
/llava/train/train_mem.py:
----------------------------------------
from llava.train.train import train
import torch.multiprocessing as mp

if __name__ == "__main__":
    mp.set_start_method('spawn')
    train()
----------------------------------------
/llava/model/multimodal_encoder/__pycache__/bmclip_encoder.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_encoder/__pycache__/bmclip_encoder.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_resampler/__pycache__/masked_drop.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_resampler/__pycache__/masked_drop.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_resampler/__pycache__/spatial_pool.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_resampler/__pycache__/spatial_pool.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_encoder/__pycache__/open_clip_encoder.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_encoder/__pycache__/open_clip_encoder.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_projector/__pycache__/pooler_projector.cpython-310.pyc:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_projector/__pycache__/pooler_projector.cpython-310.pyc
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz:
----------------------------------------
https://raw.githubusercontent.com/mbzuai-oryx/MIRA/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz
----------------------------------------
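Note: llava/train/train_mem.py shown above is the training entry point; the training scripts later in this listing (for example scripts/archived/archived_prev/finetune_sqa.sh) launch it through DeepSpeed with one of the ZeRO configs under scripts/. A minimal launch sketch, using only flags that appear in those scripts; every value in angle brackets is a placeholder, not a path from this repository:

# illustrative sketch, not a script shipped in the repo
deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path <base-llm-or-checkpoint> \
    --data_path <instruction_data.json> \
    --image_folder <image_folder> \
    --vision_tower openai/clip-vit-large-patch14 \
    --bf16 True \
    --output_dir ./checkpoints/<run_name>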
/scripts/archived/interleave/eval_all.sh:
----------------------------------------
# evaluate
./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_image_in_domain
./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_image_out_domain
./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_view_in_domain
----------------------------------------
/llava/constants.py:
----------------------------------------
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
----------------------------------------
/scripts/archived/archived_prev/convert_mmvet_for_eval.py:
----------------------------------------
import os
import json
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

cur_result = {}

for line in open(args.src):
    data = json.loads(line)
    qid = data["question_id"]
    cur_result[f"v1_{qid}"] = data["text"]

with open(args.dst, "w") as f:
    json.dump(cur_result, f, indent=2)
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-B-16.json:
----------------------------------------
{
    "embed_dim": 512,
    "vision_cfg": {
        "image_size": 224,
        "layers": 12,
        "width": 768,
        "patch_size": 16,
        "eva_model_name": "eva-clip-b-16",
        "ls_init_value": 0.1,
        "drop_path_rate": 0.0
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 512,
        "heads": 8,
        "layers": 12
    }
}
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-B-16.json:
----------------------------------------
{
    "embed_dim": 512,
    "vision_cfg": {
        "image_size": 224,
        "layers": 12,
        "width": 768,
        "patch_size": 16,
        "eva_model_name": "eva-clip-b-16",
        "ls_init_value": 0.1,
        "drop_path_rate": 0.0
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 512,
        "heads": 8,
        "layers": 12
    }
}
----------------------------------------
/scripts/archived/archived_prev/convert_gqa_for_eval.py:
----------------------------------------
import os
import json
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dst", type=str)
args = parser.parse_args()

all_answers = []
for line_idx, line in enumerate(open(args.src)):
    res = json.loads(line)
    question_id = res["question_id"]
    text = res["text"].rstrip(".").lower()
    all_answers.append({"questionId": question_id, "prediction": text})

with open(args.dst, "w") as f:
    json.dump(all_answers, f)
----------------------------------------
/scripts/archived/archived_prev/sqa_eval_batch.sh:
----------------------------------------
#!/bin/bash

CHUNKS=8
for IDX in {0..7}; do
    CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \
        --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \
        --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \
        --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \
        --answers-file ./test_llava-13b-chunk${IDX}.jsonl \
        --num-chunks $CHUNKS \
        --chunk-idx $IDX \
        --conv-mode llava_v1 &
done
----------------------------------------
/scripts/archived/archived_prev/sqa_eval_gather.sh:
----------------------------------------
#!/bin/bash

CHUNKS=8
output_file="test_llava-13b.jsonl"

# Clear out the output file if it exists.
> "$output_file"

# Loop through the indices and concatenate each file.
for idx in $(seq 0 $((CHUNKS-1))); do
    cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file"
done

python llava/eval/eval_science_qa.py \
    --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \
    --result-file ./test_llava-13b.jsonl \
    --output-file ./test_llava-13b_output.json \
    --output-result ./test_llava-13b_result.json
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 40,
        "width": 1408,
        "head_width": 88,
        "mlp_ratio": 4.3637,
        "patch_size": 14,
        "eva_model_name": "eva-clip-g-14-x",
        "drop_path_rate": 0.4,
        "xattn": true,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 768,
        "heads": 12,
        "layers": 12,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 40,
        "width": 1408,
        "head_width": 88,
        "mlp_ratio": 4.3637,
        "patch_size": 14,
        "eva_model_name": "eva-clip-g-14-x",
        "drop_path_rate": 0,
        "xattn": true,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1024,
        "heads": 16,
        "layers": 24,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/scripts/archived/interleave/eval_interleave_3d.sh:
----------------------------------------
alias python=python3
CKPT_PATH=$1
NAME=$(echo "$CKPT_PATH" | awk -F'/' '{print $NF}')
echo $NAME
##### set images path
DATA_PATH=$2
EVAL_TYPE=$3
JSON_PATH=$2/$3.json
############################### eval multi-image
RESULT_NAME="logs/${NAME}/${EVAL_TYPE}"
echo $RESULT_NAME

mkdir -p logs/${NAME}

file_path=${RESULT_NAME}/result.jsonl

bash scripts/interleave/eval_multiprocess.sh \
    ${CKPT_PATH} \
    ${JSON_PATH} \
    ${RESULT_NAME} \
    ${DATA_PATH} \
    "" \
    8 0

python3 llava/eval/evaluate_interleave.py --result-dir ${RESULT_NAME}
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 40,
        "width": 1408,
        "head_width": 88,
        "mlp_ratio": 4.3637,
        "patch_size": 14,
        "eva_model_name": "eva-clip-g-14-x",
        "drop_path_rate": 0.4,
        "xattn": true,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 768,
        "heads": 12,
        "layers": 12,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 40,
        "width": 1408,
        "head_width": 88,
        "mlp_ratio": 4.3637,
        "patch_size": 14,
        "eva_model_name": "eva-clip-g-14-x",
        "drop_path_rate": 0,
        "xattn": true,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1024,
        "heads": 16,
        "layers": 24,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/data-factory/data-merge/merge_data.py:
----------------------------------------
import os, json, ast, tqdm

full_info = json.loads(open('lmed_instruction_wrag.json').read())
write_list = []

for nowsetid in tqdm.tqdm(range(len(full_info))):
    if not os.path.exists(f"rewritten_data/{nowsetid}.json"):
        continue
    else:
        nowset = ast.literal_eval(full_info[nowsetid])
        rewritten_set = json.loads(open(f"rewritten_data/{nowsetid}.json").read())

        rewritten_set["conversations_original"] = nowset["conversations"]

        write_list.append(rewritten_set)

with open("lmed_instruction_rtra_51k.json", "w") as jsonf:
    json.dump(write_list, jsonf)

print("Finished!")
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 64,
        "width": 1792,
        "head_width": 112,
        "mlp_ratio": 8.571428571428571,
        "patch_size": 14,
        "eva_model_name": "eva-clip-4b-14-x",
        "drop_path_rate": 0,
        "xattn": true,
        "postnorm": true,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1024,
        "heads": 16,
        "layers": 24,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/__init__.py:
----------------------------------------
import os

AVAILABLE_MODELS = {
    "llava_llama": "LlavaLlamaForCausalLM, LlavaConfig",
    "llava_qwen": "LlavaQwenForCausalLM, LlavaQwenConfig",
    "llava_mistral": "LlavaMistralForCausalLM, LlavaMistralConfig",
    "llava_mixtral": "LlavaMixtralForCausalLM, LlavaMixtralConfig",
    # "llava_qwen_moe": "LlavaQwenMoeForCausalLM, LlavaQwenMoeConfig",
    # Add other models as needed
}

for model_name, model_classes in AVAILABLE_MODELS.items():
    try:
        exec(f"from .language_model.{model_name} import {model_classes}")
    except Exception as e:
        print(f"Failed to import {model_name} from llava.model.language_model.{model_name}. Error: {e}")
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 77,
        "width": 2304,
        "head_width": 144,
        "mlp_ratio": 10.9722,
        "patch_size": 14,
        "eva_model_name": "eva-clip-10b-14-x",
        "drop_path_rate": 0,
        "xattn": true,
        "postnorm": false,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 64,
        "width": 1792,
        "head_width": 112,
        "mlp_ratio": 8.571428571428571,
        "patch_size": 14,
        "eva_model_name": "eva-clip-4b-14-x",
        "drop_path_rate": 0,
        "xattn": true,
        "postnorm": true,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 448,
        "layers": 77,
        "width": 2304,
        "head_width": 144,
        "mlp_ratio": 10.9722,
        "patch_size": 14,
        "eva_model_name": "eva-clip-10b-14-x",
        "drop_path_rate": 0,
        "xattn": true,
        "postnorm": false,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 64,
        "width": 1792,
        "head_width": 112,
        "mlp_ratio": 8.571428571428571,
        "patch_size": 14,
        "eva_model_name": "eva-clip-4b-14-x",
        "drop_path_rate": 0,
        "xattn": true,
        "postnorm": true,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1024,
        "heads": 16,
        "layers": 24,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 77,
        "width": 2304,
        "head_width": 144,
        "mlp_ratio": 10.9722,
        "patch_size": 14,
        "eva_model_name": "eva-clip-10b-14-x",
        "drop_path_rate": 0,
        "xattn": true,
        "postnorm": false,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 64,
        "width": 1792,
        "head_width": 112,
        "mlp_ratio": 8.571428571428571,
        "patch_size": 14,
        "eva_model_name": "eva-clip-4b-14-x",
        "drop_path_rate": 0,
        "xattn": true,
        "postnorm": true,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json:
----------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 448,
        "layers": 77,
        "width": 2304,
        "head_width": 144,
        "mlp_ratio": 10.9722,
        "patch_size": 14,
        "eva_model_name": "eva-clip-10b-14-x",
        "drop_path_rate": 0,
        "xattn": true,
        "postnorm": false,
        "fusedLN": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B.json:
----------------------------------------
{
    "embed_dim": 1280,
    "vision_cfg": {
        "image_size": 224,
        "layers": 32,
        "width": 4096,
        "head_width": 128,
        "mlp_ratio": 5,
        "patch_size": 14,
        "eva_model_name": "eva-clip-8b-14-x",
        "drop_path_rate": 0,
        "qkv_bias": false,
        "xattn": true,
        "postnorm": false,
        "fusedLN": false,
        "use_rms_norm": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": false
    }
}
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-18B.json:
----------------------------------------
{
    "embed_dim": 1536,
    "vision_cfg": {
        "image_size": 224,
        "layers": 48,
        "width": 5120,
        "head_width": 128,
        "mlp_ratio": 5,
        "patch_size": 14,
        "eva_model_name": "eva-clip-18b-14-x",
        "drop_path_rate": 0,
        "qkv_bias": false,
        "xattn": true,
        "postnorm": true,
        "fusedLN": false,
        "use_rms_norm": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": false
    }
}
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B-plus.json:
----------------------------------------
{
    "embed_dim": 1280,
    "vision_cfg": {
        "image_size": 448,
        "layers": 32,
        "width": 4096,
        "head_width": 128,
        "mlp_ratio": 5,
        "patch_size": 14,
        "eva_model_name": "eva-clip-8b-14-plus-x",
        "drop_path_rate": 0,
        "qkv_bias": false,
        "xattn": true,
        "postnorm": false,
        "fusedLN": false,
        "use_rms_norm": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": false
    }
}
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-18B.json:
----------------------------------------
{
    "embed_dim": 1536,
    "vision_cfg": {
        "image_size": 224,
        "layers": 48,
        "width": 5120,
        "head_width": 128,
        "mlp_ratio": 5,
        "patch_size": 14,
        "eva_model_name": "eva-clip-18b-14-x",
        "drop_path_rate": 0,
        "qkv_bias": false,
        "xattn": true,
        "postnorm": true,
        "fusedLN": false,
        "use_rms_norm": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": false
    }
}
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B.json:
----------------------------------------
{
    "embed_dim": 1280,
    "vision_cfg": {
        "image_size": 224,
        "layers": 32,
        "width": 4096,
        "head_width": 128,
        "mlp_ratio": 5,
        "patch_size": 14,
        "eva_model_name": "eva-clip-8b-14-x",
        "drop_path_rate": 0,
        "qkv_bias": false,
        "xattn": true,
        "postnorm": false,
        "fusedLN": false,
        "use_rms_norm": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": false
    }
}
----------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B-plus.json:
----------------------------------------
{
    "embed_dim": 1280,
    "vision_cfg": {
        "image_size": 448,
        "layers": 32,
        "width": 4096,
        "head_width": 128,
        "mlp_ratio": 5,
        "patch_size": 14,
        "eva_model_name": "eva-clip-8b-14-plus-x",
        "drop_path_rate": 0,
        "qkv_bias": false,
        "xattn": true,
        "postnorm": false,
        "fusedLN": false,
        "use_rms_norm": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1280,
        "heads": 20,
        "layers": 32,
        "xattn": false,
        "fusedLN": false
    }
}
----------------------------------------
/scripts/archived/video/eval/video_detail_description_eval_only.sh:
----------------------------------------
#!/bin/bash
ROOT_DIR="root to LLaVA-NeXT-Video"

if [ ! -e $ROOT_DIR ]; then
    echo "The root dir does not exist. Exiting the script."
    exit 1
fi

cd $ROOT_DIR

export PYTHONWARNINGS=ignore
export TOKENIZERS_PARALLELISM=false

OPENAIKEY="INPUT YOUR OPENAI API"

SAVE_DIR=$1

python3 llava/eval/evaluate_benchmark_video_detail_description.py \
    --pred_path ./work_dirs/eval_video_detail_description/$SAVE_DIR/pred.json \
    --output_dir ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results \
    --output_json ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results.json \
    --num_chunks 1 \
    --num_tasks 16 \
    --api_key $OPENAIKEY
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-B-16.json:
----------------------------------------
{
    "embed_dim": 512,
    "vision_cfg": {
        "image_size": 224,
        "layers": 12,
        "width": 768,
        "head_width": 64,
        "patch_size": 16,
        "mlp_ratio": 2.6667,
        "eva_model_name": "eva-clip-b-16-X",
        "drop_path_rate": 0.0,
        "xattn": true,
        "fusedLN": true,
        "rope": true,
        "pt_hw_seq_len": 16,
        "intp_freq": true,
        "naiveswiglu": true,
        "subln": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 512,
        "heads": 8,
        "layers": 12,
        "xattn": true,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14.json:
----------------------------------------
{
    "embed_dim": 768,
    "vision_cfg": {
        "image_size": 224,
        "layers": 24,
        "width": 1024,
        "drop_path_rate": 0,
        "head_width": 64,
        "mlp_ratio": 2.6667,
        "patch_size": 14,
        "eva_model_name": "eva-clip-l-14",
        "xattn": true,
        "fusedLN": true,
        "rope": true,
        "pt_hw_seq_len": 16,
        "intp_freq": true,
        "naiveswiglu": true,
        "subln": true
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 768,
        "heads": 12,
        "layers": 12,
        "xattn": false,
        "fusedLN": true
    }
}
----------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14-336.json:
----------------------------------------
{
    "embed_dim": 768,
    "vision_cfg": {
        "image_size": 336,
        "layers": 24,
        "width": 1024,
        "drop_path_rate": 0,
        "head_width": 64,
        "mlp_ratio": 2.6667,
        "patch_size": 14,
        "eva_model_name": "eva-clip-l-14-336",
        "xattn": true,
        "fusedLN": true,
| "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg, convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 6 | from .openai import 
load_openai_model, list_openai_models 7 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 8 | from .tokenizer import SimpleTokenizer, tokenize 9 | from .transform import image_transform 10 | -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map="cpu") 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "offload_optimizer": { 19 | "device": "cpu", 20 | "pin_memory": true 21 | }, 22 | "offload_param": { 23 | "device": "cpu", 24 | "pin_memory": true 25 | }, 26 | "overlap_comm": true, 27 | "contiguous_gradients": true, 28 | "sub_group_size": 1e9, 29 | "reduce_bucket_size": "auto" 30 | } 31 | } -------------------------------------------------------------------------------- /data-factory/rag-crawler/preprocess_list.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import tqdm 3 | 4 | disease_base = 
pd.read_csv('human_disease_textmining_full.tsv', sep='\t', header=None) 5 | failed = success = 0 6 | wikipedia_url_base = "https://en.wikipedia.org/wiki/" 7 | 8 | task_list = [] 9 | dl_dup_set = [] 10 | 11 | url = [] 12 | imdir = [] 13 | tdir = [] 14 | 15 | for index, row in tqdm.tqdm(disease_base.iterrows(), total = len(disease_base)): 16 | now_disease = row[3].replace(" ", "_") 17 | if now_disease not in dl_dup_set: 18 | dl_dup_set.append(now_disease) 19 | else: 20 | continue 21 | 22 | wikipedia_url = wikipedia_url_base + now_disease 23 | url.append(wikipedia_url) 24 | imdir.append("./downloaded_pages/images/"+now_disease) 25 | tdir.append("./downloaded_pages/text/"+now_disease+".csv") 26 | 27 | df = pd.DataFrame({"url": url, "image_dir": imdir, "text_dir": tdir}) 28 | df.to_csv("full_disease_dl.csv", index=False, encoding='utf-8') 29 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if "llava" in config and "llava" not in cfg.model_type: 7 | assert cfg.model_type == "llama" 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = "LlavaLlamaForCausalLM" 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /scripts/archived/qwen.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | import torch 3 | 4 | device = "cuda" # the device to load the model onto 5 | 6 | model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B-Chat", torch_dtype=torch.bfloat16, device_map="auto") 7 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B-Chat") 8 | 9 | prompt = "Give me a short introduction to large language model." 
messages = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(response)
----------------------------------------
/llava/model/consolidate.py:
----------------------------------------
"""
Usage:
python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
"""

import argparse

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from llava.model import *
from llava.model.utils import auto_upgrade


def consolidate_ckpt(src_path, dst_path):
    print("Loading model")
    auto_upgrade(src_path)
    src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
    src_model.save_pretrained(dst_path)
    src_tokenizer.save_pretrained(dst_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--src", type=str, required=True)
    parser.add_argument("--dst", type=str, required=True)

    args = parser.parse_args()

    consolidate_ckpt(args.src, args.dst)
----------------------------------------
/llava/model/multimodal_projector/pooler_projector.py:
----------------------------------------
import torch
import torch.nn as nn

import math

from transformers.models.clip.modeling_clip import CLIPVisionModel


class PoolerProjector(nn.Module):
    def __init__(self, config, vision_cfg):
        super().__init__()
        self._config = config
        self.hw = vision_cfg.image_size // vision_cfg.patch_size

        self.conv_pool = nn.Conv2d(config.mm_hidden_size, config.hidden_size, kernel_size=2, stride=2)

        self.proj = nn.Sequential(
            nn.GELU(),
            nn.Linear(config.hidden_size, config.hidden_size),
        )

    def forward(self, x, *args, **kwargs):
        height = width = self.hw
        assert height * width == x.shape[1]
        x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2)
        x = self.conv_pool(x)
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)
        return x

    @property
    def config(self):
        return {"mm_projector_type": "pooler"}
----------------------------------------
/llava/model/multimodal_resampler/builder.py:
----------------------------------------
import torch

from .masked_drop import MaskedDrop
from .spatial_pool import SpatialPool
from .perceiver import PerceiverResampler
from .qformer import Qformer


class IdentityMap(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        return x

    @property
    def config(self):
        return {"mm_resampler_type": None}


def build_vision_resampler(model_args, delay_load=False, **kwargs):
    resampler_type = getattr(model_args, "mm_resampler_type", None)
    if resampler_type == "masked_drop":
        return MaskedDrop(model_args)
    elif resampler_type == "spatial_pool":
        return SpatialPool(model_args, **kwargs)
    elif resampler_type == "perceiver":
        return PerceiverResampler(model_args, **kwargs)
    elif resampler_type == "qformer":
        return Qformer(model_args, **kwargs)
    elif resampler_type is None:
        return IdentityMap()

    raise ValueError(f"Unknown resampler type: {resampler_type}")
----------------------------------------
/scripts/zero2.json:
----------------------------------------
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": false,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
----------------------------------------
/scripts/zero2_fused_adamw.json:
----------------------------------------
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
----------------------------------------
/scripts/zero3.json:
----------------------------------------
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "offload_param": {
            "device": "none",
"pin_memory": true 23 | }, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /scripts/archived/train/mid_stage.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | - json_path: /mnt/bn/vl-research/data/llava_instruct/blip558k_stage1.5_finetune_w_prompt.json # released in lmms-lab/LLaVA-ReCap-* 3 | sampling_strategy: all 4 | - json_path: /mnt/bn/vl-research/data/llava_instruct/coco118k_stage1.5_finetune_w_prompt.json # released in lmms-lab/LLaVA-ReCap-* 5 | sampling_strategy: all 6 | - json_path: /mnt/bn/vl-research/data/llava_instruct/cc3m_recap_data_prompt_v2.json # released in lmms-lab/LLaVA-ReCap-* 7 | sampling_strategy: all 8 | - json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_tr_sft.json # released in lmms-lab/LLaVA-OneVision-Mid-Data 9 | sampling_strategy: all 10 | - json_path: /mnt/bn/vl-research/data/llava_instruct/instruct_azure_dc_zh_92K.json # not released, explained at https://github.com/LLaVA-VL/LLaVA-NeXT/tree/main/scripts/train 11 | sampling_strategy: all 12 | - json_path: /mnt/bn/vl-research/data/llava_instruct/Evol-Instruct-GPT4-Turbo-143K.json # released in lmms-lab/LLaVA-OneVision-Mid-Data 13 | sampling_strategy: all 14 | - json_path: /mnt/bn/vl-research/data/llava_instruct/synthdog_zh/synthdog_zh_100k.json # released in lmms-lab/LLaVA-OneVision-Mid-Data 15 | sampling_strategy: all 16 | - json_path: /mnt/bn/vl-research/data/llava_instruct/synthdog_en/synthdog_en_100k.json # released in lmms-lab/LLaVA-OneVision-Mid-Data 17 | sampling_strategy: all -------------------------------------------------------------------------------- /scripts/archived/video/demo/video_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ROOT_DIR="/mnt/bn/vl-research/workspace/yhzhang/LLaVA-NeXT" 3 | 4 | if [ ! -e $ROOT_DIR ]; then 5 | echo "The root dir does not exist. Exiting the script." 
6 | exit 1 7 | fi 8 | 9 | cd $ROOT_DIR 10 | 11 | export PYTHONWARNINGS=ignore 12 | export TOKENIZERS_PARALLELISM=false 13 | 14 | CKPT=$1 15 | CONV_MODE=$2 16 | FRAMES=$3 17 | POOL_STRIDE=$4 18 | POOL_MODE=$5 19 | NEWLINE_POSITION=$6 20 | OVERWRITE=$7 21 | VIDEO_PATH=$8 22 | 23 | 24 | if [ "$OVERWRITE" = False ]; then 25 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}_overwrite_${OVERWRITE} 26 | 27 | else 28 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE} 29 | fi 30 | 31 | python3 playground/demo/video_demo.py \ 32 | --model-path $CKPT \ 33 | --video_path ${VIDEO_PATH} \ 34 | --output_dir ./work_dirs/video_demo/$SAVE_DIR \ 35 | --output_name pred \ 36 | --chunk-idx $(($IDX - 1)) \ 37 | --overwrite ${OVERWRITE} \ 38 | --mm_spatial_pool_stride ${POOL_STRIDE:-4} \ 39 | --for_get_frames_num $FRAMES \ 40 | --conv-mode $CONV_MODE \ 41 | --mm_spatial_pool_mode ${POOL_MODE:-average} \ 42 | --mm_newline_position ${NEWLINE_POSITION:-grid} \ 43 | --prompt "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes." -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.3 \ 6 | --version $PROMPT_VERSION \ 7 | --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \ 8 | --image_folder /Data/ScienceQA/data/scienceqa/images/train \ 9 | --vision_tower openai/clip-vit-large-patch14 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \ 11 | --mm_vision_select_layer -2 \ 12 | --mm_use_im_start_end False \ 13 | --mm_use_im_patch_token False \ 14 | --bf16 True \ 15 | --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \ 16 | --num_train_epochs 12 \ 17 | --per_device_train_batch_size 16 \ 18 | --per_device_eval_batch_size 4 \ 19 | --gradient_accumulation_steps 1 \ 20 | --evaluation_strategy "no" \ 21 | --save_strategy "steps" \ 22 | --save_steps 50000 \ 23 | --save_total_limit 1 \ 24 | --learning_rate 2e-5 \ 25 | --weight_decay 0. 
\ 26 | --warmup_ratio 0.03 \ 27 | --lr_scheduler_type "cosine" \ 28 | --logging_steps 1 \ 29 | --tf32 True \ 30 | --model_max_length 2048 \ 31 | --gradient_checkpointing True \ 32 | --dataloader_num_workers 16 \ 33 | --lazy_preprocess True \ 34 | --report_to wandb 35 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/entry_cmd.sh: -------------------------------------------------------------------------------- 1 | python3 -m pip install --upgrade pip; 2 | 3 | export http_proxy=http://sys-proxy-rd-relay.byted.org:8118; 4 | export https_proxy=http://sys-proxy-rd-relay.byted.org:8118; 5 | 6 | export HF_HOME=/mnt/bn/vl-research-boli01-cn/.cache/huggingface; 7 | export HF_TOKEN="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"; 8 | export HF_HUB_ENABLE_HF_TRANSFER="1"; 9 | 10 | cd /mnt/bn/vl-research-boli01-cn/projects/zzz/lmms-eval; 11 | pip install -e .; 12 | 13 | cd /mnt/bn/vl-research-boli01-cn/projects/zzz/LLaVA_Next; 14 | pip install -e .; 15 | 16 | python3 -m pip install ninja; 17 | python3 -m pip install flash-attn --no-build-isolation; 18 | 19 | bash /mnt/bn/vl-research-boli01-cn/projects/zzz/LLaVA_Next/cn_scripts/vicuna/internal0.6m_finetune_llava1.6mix_7b_v0.2_unfreeze.sh 20 | 21 | 22 | accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ 23 | --model llava \ 24 | --model_args pretrained="/mnt/bn/vl-research-boli01-cn/projects/zzz/LLaVA_Next/internal_project_checkpoints/llavanext-lmsys_vicuna-7b-v1.5-clip-vit-large-patch14-336-mlp2x_gelu-pretrain_internal0.6m_vicuna_v1_finetune_llava1.6_datamix_unfreezeVIS_1e" \ 25 | --tasks ok_vqa,textcaps_val,mme_test,mmmu,cmmmu,coco2017_cap_val,vizwiz_vqa_val,ai2d,chartqa,pope \ 26 | --batch_size 1 \ 27 | --log_samples \ 28 | --log_samples_suffix debug \ 29 | --output_path ./logs/ \ 30 | --wandb_args 'project=llava-next-lmms-eval,job_type=eval'; -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "cpu", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": "auto", 44 | "train_batch_size": "auto", 45 | "train_micro_batch_size_per_gpu": "auto", 46 | "steps_per_print": 1e5, 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /scripts/archived/archived_prev/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly 
to run this script: 4 | 5 | # MODEL_VERSION=vicuna-v1-3-7b 6 | # MODEL_VERSION=llama-2-7b-chat 7 | 8 | ########### DO NOT CHANGE ########### 9 | ########### USE THIS FOR BOTH ########### 10 | PROMPT_VERSION=plain 11 | ########### DO NOT CHANGE ########### 12 | 13 | deepspeed llava/train/train_mem.py \ 14 | --deepspeed ./scripts/zero2.json \ 15 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 16 | --version $PROMPT_VERSION \ 17 | --data_path /path/to/pretrain_data.json \ 18 | --image_folder /path/to/images \ 19 | --vision_tower openai/clip-vit-large-patch14 \ 20 | --tune_mm_mlp_adapter True \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 16 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --learning_rate 2e-3 \ 34 | --weight_decay 0. \ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 16 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--annotation-file", type=str, required=True) 11 | parser.add_argument("--result-file", type=str, required=True) 12 | parser.add_argument("--result-upload-file", type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == "__main__": 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x["question_id"]: x["text"] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x["question_id"] for x in test_split]) 32 | 33 | print(f"total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}") 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | # import pdb; pdb.set_trace() 41 | assert x["question_id"] in results, print(x) 42 | all_answers.append({"image": x["image"], "answer": answer_processor(results[x["question_id"]])}) 43 | 44 | with open(args.result_upload_file, "w") as f: 45 | json.dump(all_answers, f) 46 | -------------------------------------------------------------------------------- /scripts/zero3pp.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 
18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "zero_optimization": { 24 | "stage": 3, 25 | "offload_optimizer": { 26 | "device": "none", 27 | "pin_memory": true 28 | }, 29 | "offload_param": { 30 | "device": "none", 31 | "pin_memory": true 32 | }, 33 | "overlap_comm": true, 34 | "contiguous_gradients": true, 35 | "zero_quantized_weights": true, 36 | "zero_hpz_partition_size": 16, 37 | "zero_quantized_gradients": true, 38 | "sub_group_size": 1e9, 39 | "reduce_bucket_size": "auto", 40 | "stage3_prefetch_bucket_size": "auto", 41 | "stage3_param_persistence_threshold": "auto", 42 | "stage3_max_live_parameters": 1e9, 43 | "stage3_max_reuse_distance": 1e9, 44 | "stage3_gather_16bit_weights_on_model_save": true 45 | }, 46 | 47 | "gradient_accumulation_steps": "auto", 48 | "gradient_clipping": "auto", 49 | "steps_per_print": 100, 50 | "train_batch_size": "auto", 51 | "train_micro_batch_size_per_gpu": "auto", 52 | "wall_clock_breakdown": false 53 | } -------------------------------------------------------------------------------- /data-factory/rag-maker/rewritor.py: -------------------------------------------------------------------------------- 1 | import re, requests 2 | 3 | def markdown_seq_check(text): 4 | # Define common markdown patterns 5 | markdown_patterns = [ 6 | r"^#{1,6}\s", # Headers (e.g. #, ##, ###) 7 | # r"\*\*.*?\*\*", # Bold (e.g. **bold**) 8 | # r"\*.*?\*", # Italic (e.g. *italic*) 9 | # r"\[.*?\]\(.*?\)", # Links (e.g. [text](url)) 10 | r"^\s*[-*+]\s+", # Unordered lists (- item, * item, + item) 11 | r"^\d+\.\s+", # Ordered lists (1. item) 12 | # r"`.*?`", # Inline code (e.g. `code`) 13 | r"```[\s\S]*?```", # Code blocks (e.g. ```code```) 14 | r"^>.*", # Blockquotes (e.g. > quote) 15 | # r"_{2}.*?_{2}", # Underline (_italic_) 16 | ] 17 | 18 | markdown_regex = re.compile("|".join(markdown_patterns), re.MULTILINE) 19 | return bool(markdown_regex.search(text)) 20 | 21 | def API_summarization(paragraph, host_url, target_model): 22 | prompt = [ 23 | {"role": "system", "content": "Please simpilify user's given text into one paragraph. Please keep necessary information from the original paragraph so that reader can clearly understand what the paragraph is about. Please describe from an encyclopedic perspective, i.e. use third person perspective. Only output one clear and informative paragraph with no format and no citations. 
Do not make it like markdown, make it ONE PARAGRAPH."}, 24 | {"role": "user", "content": paragraph} 25 | ] 26 | results = requests.post(host_url, json={"model": target_model, "messages": prompt, "stream": False}).json() 27 | 28 | return results["message"]["content"] -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_full_schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | ################## VICUNA ################## 6 | # PROMPT_VERSION=v1 7 | # MODEL_VERSION="vicuna-v1-3-7b" 8 | ################## VICUNA ################## 9 | 10 | ################## LLaMA-2 ################## 11 | # PROMPT_VERSION="llava_llama_2" 12 | # MODEL_VERSION="llama-2-7b-chat" 13 | ################## LLaMA-2 ################## 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 18 | --version $PROMPT_VERSION \ 19 | --data_path ./playground/data/llava_instruct_158k.json \ 20 | --image_folder /path/to/coco/train2017 \ 21 | --vision_tower openai/clip-vit-large-patch14 \ 22 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 28 | --num_train_epochs 3 \ 29 | --per_device_train_batch_size 16 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 1 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 50000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 2e-5 \ 37 | --weight_decay 0. 
\ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 16 \ 45 | --lazy_preprocess True \ 46 | --report_to wandb 47 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | ################## VICUNA ################## 6 | # PROMPT_VERSION=v1 7 | # MODEL_VERSION="vicuna-v1-3-7b" 8 | ################## VICUNA ################## 9 | 10 | ################## LLaMA-2 ################## 11 | # PROMPT_VERSION="llava_llama_2" 12 | # MODEL_VERSION="llama-2-7b-chat" 13 | ################## LLaMA-2 ################## 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --lora_enable True \ 18 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 19 | --version $PROMPT_VERSION \ 20 | --data_path ./playground/data/llava_instruct_80k.json \ 21 | --image_folder /path/to/coco/train2017 \ 22 | --vision_tower openai/clip-vit-large-patch14 \ 23 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 24 | --mm_vision_select_layer -2 \ 25 | --mm_use_im_start_end False \ 26 | --mm_use_im_patch_token False \ 27 | --bf16 True \ 28 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 29 | --num_train_epochs 1 \ 30 | --per_device_train_batch_size 16 \ 31 | --per_device_eval_batch_size 4 \ 32 | --gradient_accumulation_steps 1 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 50000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 2e-5 \ 38 | --weight_decay 0. 
\ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 True \ 43 | --model_max_length 2048 \ 44 | --gradient_checkpointing True \ 45 | --lazy_preprocess True \ 46 | --dataloader_num_workers 16 \ 47 | --report_to wandb 48 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | ################## VICUNA ################## 6 | # PROMPT_VERSION=v1 7 | # MODEL_VERSION="vicuna-v1-3-7b" 8 | ################## VICUNA ################## 9 | 10 | ################## LLaMA-2 ################## 11 | # PROMPT_VERSION="llava_llama_2" 12 | # MODEL_VERSION="llama-2-7b-chat" 13 | ################## LLaMA-2 ################## 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --lora_enable True \ 18 | --bits 4 \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_80k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. 
\ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --lazy_preprocess True \ 47 | --dataloader_num_workers 16 \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--dir", type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument("--ckpt", type=str, required=True) 12 | parser.add_argument("--split", type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == "__main__": 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, "answers", args.split, args.ckpt, "merge.jsonl") 21 | test_split = os.path.join(args.dir, "llava_vqav2_mscoco_test2015.jsonl") 22 | dst = os.path.join(args.dir, "answers_upload", args.split, f"{args.ckpt}.json") 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x["question_id"]: x["text"] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x["question_id"] for x in test_split]) 36 | 37 | print(f"total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}") 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x["question_id"] not in results: 45 | all_answers.append({"question_id": x["question_id"], "answer": ""}) 46 | else: 47 | all_answers.append({"question_id": x["question_id"], "answer": answer_processor(results[x["question_id"]])}) 48 | 49 | with open(dst, "w") as f: 50 | json.dump(all_answers, f) 51 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/spatial_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | 6 | class SpatialPool(nn.Module): 7 | def __init__(self, model_args, vision_tower): 8 | super().__init__() 9 | 10 | self.mode = model_args.mm_spatial_pool_mode 11 | self.stride = model_args.mm_spatial_pool_stride 12 | self.out_channels = getattr(model_args, "mm_spatial_pool_out_channels", vision_tower.hidden_size) 13 | 14 | if self.mode == "average": 15 | self.pool = nn.AvgPool2d(kernel_size=self.stride, stride=self.stride) 16 | elif self.mode == "max": 17 | self.pool = nn.MaxPool2d(kernel_size=self.stride, stride=self.stride) 18 | elif self.mode == "conv": 19 | self.pool = nn.Conv2d(in_channels=vision_tower.hidden_size, out_channels=self.out_channels, kernel_size=self.stride, stride=self.stride) 20 | else: 21 | raise ValueError(f"Unknown pooling mode: {self.mode}.") 22 | 23 | def forward(self, image_features, images, *args, **kwargs): 24 | ori_W = int(math.sqrt(image_features.shape[1] * images.shape[3] // images.shape[2])) 25 | ori_H = int(ori_W * images.shape[2] // images.shape[3]) 26 | 27 | B, _, F = image_features.shape 28 | 29 | 
image_features_spatial = image_features.view(B, ori_H, ori_H, F).permute(0, 3, 1, 2) 30 | image_features_spatial_pool = self.pool(image_features_spatial) 31 | 32 | return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous() 33 | 34 | @property 35 | def config(self): 36 | return { 37 | "mm_resampler_type": "spatial_pool", 38 | "mm_spatial_pool_stride": self.stride, 39 | "mm_spatial_pool_mode": self.mode, 40 | "mm_spatial_pool_out_channels": self.out_channels, 41 | } 42 | 43 | @property 44 | def hidden_size(self): 45 | return self.out_channels 46 | -------------------------------------------------------------------------------- /scripts/archived/interleave/eval_multiprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if three arguments are passed 4 | if [ "$#" -ne 7 ]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Assign the command line arguments to variables 10 | model_path=$1 11 | question_path=$2 12 | base_answer_path=$3 13 | image_folder=$4 14 | extra_prompt=$5 15 | N=$6 16 | temperature=$7 17 | 18 | # Loop over each chunk/process 19 | for (( chunk_id=0; chunk_id "${base_answer_path}.jsonl" 42 | for ((i=0; i> "${base_answer_path}/result.jsonl" 45 | done 46 | # remove the unmerged files 47 | for (( chunk_id=0; chunk_id 0: 33 | print(f"Missing images in {json_path}:") 34 | for d in missing_data: 35 | print(d) 36 | 37 | 38 | def direct_check_llava_data(json_path, images_folder): 39 | missing_data = check_missing_images(json_path, images_folder) 40 | if len(missing_data) > 0: 41 | print(f"Missing images in {json_path}:") 42 | for d in missing_data: 43 | print(d) 44 | 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser(description="Check for missing images in dataset.") 48 | parser.add_argument("--yaml_path", type=str, default="", help="Path to the YAML file containing the dataset.") 49 | parser.add_argument("--json_path", type=str, default="", help="Path to the JSON file containing the dataset.") 50 | parser.add_argument("--images_folder", type=str, default="/mnt/bn/vl-research/data/llava_data", help="Path to the folder containing the images.") 51 | 52 | args = parser.parse_args() 53 | 54 | if args.json_path != "": 55 | direct_check_llava_data(args.json_path, args.images_folder) 56 | elif args.yaml_path != "": 57 | read_yaml_to_llava_data(args.yaml_path, args.images_folder) 58 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings", 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings", 26 | }, 27 | "pooler": 
"mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": "num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens", 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | "bert": { 46 | "config_names": { 47 | "context_length": "max_position_embeddings", 48 | "vocab_size": "vocab_size", 49 | "width": "hidden_size", 50 | "heads": "num_attention_heads", 51 | "layers": "num_hidden_layers", 52 | "layer_attr": "layer", 53 | "token_embeddings_attr": "embeddings", 54 | }, 55 | "pooler": "mean_pooler", 56 | }, 57 | } 58 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | from .pooler_projector import PoolerProjector 6 | 7 | 8 | class IdentityMap(nn.Module): 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def forward(self, x, *args, **kwargs): 13 | return x 14 | 15 | @property 16 | def config(self): 17 | return {"mm_projector_type": "identity"} 18 | 19 | 20 | class SimpleResBlock(nn.Module): 21 | def __init__(self, channels): 22 | super().__init__() 23 | self.pre_norm = nn.LayerNorm(channels) 24 | 25 | self.proj = nn.Sequential(nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)) 26 | 27 | def forward(self, x): 28 | x = self.pre_norm(x) 29 | return x + self.proj(x) 30 | 31 | 32 | def build_vision_projector(config, delay_load=False, **kwargs): 33 | projector_type = getattr(config, "mm_projector_type", "linear") 34 | 35 | if projector_type == "linear": 36 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 37 | 38 | if projector_type == "pooler": 39 | return PoolerProjector(config, kwargs["vision_cfg"]) 40 | 41 | mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type) 42 | if mlp_gelu_match: 43 | mlp_depth = int(mlp_gelu_match.group(1)) 44 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 45 | for _ in range(1, mlp_depth): 46 | modules.append(nn.GELU()) 47 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 48 | return nn.Sequential(*modules) 49 | 50 | mlp_gelu_resnet_match = re.match(r"^mlp(\d+)x_res(\d+)x_gelu$", projector_type) 51 | if mlp_gelu_resnet_match: 52 | mlp_depth = int(mlp_gelu_resnet_match.group(1)) 53 | res_depth = int(mlp_gelu_resnet_match.group(2)) 54 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 55 | for _ in range(1, mlp_depth): 56 | modules.append(nn.GELU()) 57 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 58 | for _ in range(res_depth): 59 | modules.append(SimpleResBlock(config.hidden_size)) 60 | return nn.Sequential(*modules) 61 | 62 | if projector_type == "identity": 63 | return IdentityMap() 64 | 65 | raise ValueError(f"Unknown projector type: {projector_type}") 66 | -------------------------------------------------------------------------------- /llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m 
llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from tqdm import tqdm 10 | from transformers import AutoTokenizer, AutoModelForCausalLM 11 | from llava.model.utils import auto_upgrade 12 | 13 | 14 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 15 | print("Loading base model") 16 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model" 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 31 | bparam = base.state_dict()[name] 32 | param.data[: bparam.shape[0], : bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/eva_clip_processors.py: -------------------------------------------------------------------------------- 1 | """ 2 | # Adapted from https://github.com/baaivision/EVA/tree/master/EVA-CLIP 3 | """ 4 | 5 | from torchvision import transforms 6 | from torchvision.transforms.functional import InterpolationMode 7 | from transformers.image_processing_utils import BatchFeature 8 | from PIL import Image 9 | from transformers.image_transforms import convert_to_rgb 10 | 11 | 12 | class BaseProcessor: 13 | def __init__(self): 14 | self.transform = lambda x: x 15 | return 16 | 17 | def __call__(self, item): 18 | return self.transform(item) 19 | 20 | 21 | class EvaClipImageBaseProcessor(BaseProcessor): 22 | def __init__(self, mean=None, std=None): 23 | self.mean = (0.48145466, 0.4578275, 0.40821073) if mean is None else mean 24 | self.std = (0.26862954, 0.26130258, 0.27577711) if std is None else std 25 | 26 | self.normalize = transforms.Normalize(self.mean, self.std) 27 | 28 | @property 29 | def image_mean(self): 30 | return self.mean 31 | 32 | 33 | class EvaClipImageTrainProcessor(EvaClipImageBaseProcessor): 34 | def __init__(self, image_size=224, 
mean=None, std=None, min_scale=0.5, max_scale=1.0): 35 | super().__init__(mean=mean, std=std) 36 | 37 | self.transform = transforms.Compose( 38 | [ 39 | convert_to_rgb, 40 | transforms.Resize( 41 | image_size, 42 | interpolation=InterpolationMode.BICUBIC, 43 | ), 44 | transforms.CenterCrop(image_size), 45 | transforms.ToTensor(), 46 | self.normalize, 47 | ] 48 | ) 49 | 50 | self.image_size = image_size 51 | 52 | def preprocess(self, images, return_tensors): 53 | if isinstance(images, Image.Image): 54 | images = [images] 55 | else: 56 | assert isinstance(images, list) 57 | 58 | transformed_images = [self.transform(image).numpy() for image in images] 59 | data = {"pixel_values": transformed_images} 60 | 61 | return BatchFeature(data=data, tensor_type=return_tensors) 62 | 63 | def __call__(self, item): 64 | return self.transform(item) 65 | 66 | @property 67 | def crop_size(self): 68 | return {"height": self.image_size, "width": self.image_size} 69 | 70 | @property 71 | def size(self): 72 | return {"shortest_edge": self.image_size} 73 | -------------------------------------------------------------------------------- /scripts/archived/train/dpo_ov7b.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | # export NCCL_IB_HCA=${ARNOLD_RDMA_DEVICE} 5 | export NCCL_SOCKET_IFNAME=eth0 6 | export NCCL_DEBUG=INFO 7 | 8 | VISION_MODEL_VERSION="google/siglip-so400m-patch14-384" 9 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 10 | 11 | # DPO Stage 12 | PROMPT_VERSION="qwen_1_5" 13 | SFT_MODEL="lmms-lab/llava-onevision-qwen2-7b-ov" 14 | EPOCH=1 15 | beta=0.1 16 | 17 | DPO_RUN_NAME="llava-onevision-qwen2-7b-ov_dpo-beta${beta}-epoch${EPOCH}" 18 | DPO_CLEAN_NAME="${DPO_RUN_NAME##*/}" 19 | OUTPUT_DIR="/${DPO_CLEAN_NAME}" 20 | DATA_PATH="" 21 | 22 | echo $DPO_RUN_NAME 23 | 24 | ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 25 | llava/train/train_dpo.py \ 26 | --deepspeed scripts/zero3.json \ 27 | --model_name_or_path=${SFT_MODEL} \ 28 | --dpo_alpha=1.0 \ 29 | --beta=${beta} \ 30 | --gamma=0 \ 31 | --version $PROMPT_VERSION \ 32 | --data_path=$DATA_PATH \ 33 | --image_folder "" \ 34 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 35 | --unfreeze_mm_vision_tower True \ 36 | --vision_tower ${VISION_MODEL_VERSION} \ 37 | --mm_projector_type mlp2x_gelu \ 38 | --mm_vision_select_layer -2 \ 39 | --mm_use_im_start_end False \ 40 | --mm_use_im_patch_token False \ 41 | --group_by_modality_length True \ 42 | --image_aspect_ratio anyres_max_9 \ 43 | --image_grid_pinpoints "(1x1),...,(6x6)" \ 44 | --mm_patch_merge_type spatial_unpad \ 45 | --bf16 True \ 46 | --run_name $DPO_CLEAN_NAME \ 47 | --output_dir $OUTPUT_DIR \ 48 | --num_train_epochs $EPOCH \ 49 | --per_device_train_batch_size 1 \ 50 | --per_device_eval_batch_size 1 \ 51 | --gradient_accumulation_steps 8 \ 52 | --evaluation_strategy "no" \ 53 | --save_strategy "steps" \ 54 | --save_steps 1000 \ 55 | --save_total_limit 1 \ 56 | --learning_rate 5e-7 \ 57 | --weight_decay 0. 
\ 58 | --warmup_ratio 0.1 \ 59 | --lr_scheduler_type "cosine" \ 60 | --logging_steps 1 \ 61 | --tf32 True \ 62 | --model_max_length 32768 \ 63 | --gradient_checkpointing True \ 64 | --dataloader_num_workers 4 \ 65 | --lazy_preprocess True \ 66 | --report_to wandb \ 67 | --dataloader_drop_last True 68 | 69 | 70 | -------------------------------------------------------------------------------- /scripts/archived/train/direct_finetune_clip.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | export NCCL_SOCKET_IFNAME=eth0 5 | export NCCL_DEBUG=INFO 6 | 7 | LLM_VERSION="Qwen/Qwen2-7B-Instruct" 8 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 9 | VISION_MODEL_VERSION="openai/clip-vit-large-patch14-336" 10 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 11 | 12 | ############### Pretrain ################ 13 | 14 | PROMPT_VERSION="qwen_1_5" 15 | 16 | BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" 17 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 18 | 19 | ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 20 | llava/train/train_mem.py \ 21 | --deepspeed scripts/zero3.json \ 22 | --model_name_or_path ${LLM_VERSION} \ 23 | --version ${PROMPT_VERSION} \ 24 | --data_path=llava_1_6.json \ 25 | --image_folder your_image_folder \ 26 | --pretrain_mm_mlp_adapter="/checkpoints/projectors/${BASE_RUN_NAME}/mm_projector.bin" \ 27 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 28 | --mm_vision_tower_lr=2e-6 \ 29 | --vision_tower ${VISION_MODEL_VERSION} \ 30 | --mm_projector_type mlp2x_gelu \ 31 | --mm_vision_select_layer -2 \ 32 | --mm_use_im_start_end False \ 33 | --mm_use_im_patch_token False \ 34 | --group_by_modality_length True \ 35 | --image_aspect_ratio anyres \ 36 | --image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \ 37 | --mm_patch_merge_type spatial_unpad \ 38 | --bf16 True \ 39 | --run_name $MID_RUN_NAME \ 40 | --output_dir "/checkpoints/${MID_RUN_NAME}" \ 41 | --num_train_epochs 1 \ 42 | --per_device_train_batch_size 4 \ 43 | --per_device_eval_batch_size 4 \ 44 | --gradient_accumulation_steps 1 \ 45 | --evaluation_strategy "no" \ 46 | --save_strategy "steps" \ 47 | --save_steps 3000 \ 48 | --save_total_limit 1 \ 49 | --learning_rate 1e-5 \ 50 | --weight_decay 0. 
\ 51 | --warmup_ratio 0.03 \ 52 | --lr_scheduler_type "cosine" \ 53 | --logging_steps 1 \ 54 | --tf32 True \ 55 | --model_max_length 32768 \ 56 | --gradient_checkpointing True \ 57 | --dataloader_num_workers 16 \ 58 | --lazy_preprocess True \ 59 | --report_to wandb \ 60 | --torch_compile True \ 61 | --torch_compile_backend "inductor" \ 62 | --dataloader_drop_last True \ 63 | --attn_implementation sdpa 64 | 65 | # You can delete the sdpa attn_implementation if you want to use flash attn 66 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/imagebind.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPImageProcessor 5 | 6 | try: 7 | from imagebind.models import imagebind_model 8 | from imagebind.models.imagebind_model import ModalityType 9 | from imagebind.data import load_and_transform_audio_data 10 | except ImportError: 11 | pass 12 | 13 | 14 | class ImageBindWrapper(nn.Module): 15 | def __init__(self, vision_tower, select_layer, select_feature="patch", delay_load=False): 16 | super().__init__() 17 | 18 | self.is_loaded = False 19 | 20 | self.vision_tower_name = vision_tower 21 | self.select_layer = select_layer 22 | self.select_feature = select_feature 23 | 24 | if not delay_load: 25 | self.load_model() 26 | 27 | def load_model(self): 28 | self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") 29 | self.vision_tower = imagebind_model.imagebind_huge(pretrained=True) 30 | for p in self.vision_tower.parameters(): 31 | p.requires_grad = False 32 | self.vision_tower.eval() 33 | self.is_loaded = True 34 | 35 | def train(self, mode=True): 36 | self.training = mode 37 | 38 | if self.is_loaded: 39 | self.vision_tower.eval() 40 | 41 | @torch.no_grad() 42 | def forward(self, x): 43 | if type(x) == dict: 44 | if x["audios"] is not None: 45 | inputs = {ModalityType.AUDIO: load_and_transform_audio_data(x["audios"], device=self.device).half()} 46 | embeddings = self.vision_tower(inputs) 47 | audio_embedding = embeddings[ModalityType.AUDIO] 48 | return audio_embedding.unsqueeze(1) 49 | else: 50 | inputs = {ModalityType.VISION: x.to(dtype=self.dtype)} 51 | embeddings = self.vision_tower(inputs) 52 | vision_embedding = embeddings[ModalityType.VISION] 53 | if vision_embedding.ndim == 2: 54 | return vision_embedding.unsqueeze(1) 55 | if vision_embedding.shape[1] == 257: 56 | return vision_embedding[:, 1:] 57 | raise ValueError(f"Unexpected shape: {vision_embedding.shape}") 58 | 59 | @property 60 | def dummy_feature(self): 61 | return torch.zeros(1, 1024, device=self.device, dtype=self.dtype) 62 | 63 | @property 64 | def dtype(self): 65 | return self.vision_tower.modality_preprocessors.vision.cls_token.dtype 66 | 67 | @property 68 | def device(self): 69 | return self.vision_tower.modality_preprocessors.vision.cls_token.device 70 | 71 | @property 72 | def hidden_size(self): 73 | return 1024 74 | -------------------------------------------------------------------------------- /scripts/archived/train/direct_finetune_siglip_a4.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | export NCCL_SOCKET_IFNAME=eth0 5 | export NCCL_DEBUG=INFO 6 | 7 | LLM_VERSION="Qwen/Qwen2-7B-Instruct" 8 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 9 | 
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384" 10 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 11 | 12 | ############### Pretrain ################ 13 | 14 | PROMPT_VERSION="qwen_1_5" 15 | 16 | BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" 17 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 18 | 19 | CKPT_PATH=$LLM_VERSION # this could also be the previous stage checkpoint 20 | 21 | ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 22 | llava/train/train_mem.py \ 23 | --deepspeed scripts/zero3.json \ 24 | --model_name_or_path ${CKPT_PATH} \ 25 | --version ${PROMPT_VERSION} \ 26 | --data_path=llava_1_6.json \ 27 | --image_folder your_image_folder \ 28 | --pretrain_mm_mlp_adapter="/checkpoints/projectors/${BASE_RUN_NAME}/mm_projector.bin" \ 29 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 30 | --mm_vision_tower_lr=2e-6 \ 31 | --vision_tower ${VISION_MODEL_VERSION} \ 32 | --mm_projector_type mlp2x_gelu \ 33 | --mm_vision_select_layer -2 \ 34 | --mm_use_im_start_end False \ 35 | --mm_use_im_patch_token False \ 36 | --group_by_modality_length True \ 37 | --image_aspect_ratio anyres \ 38 | --image_grid_pinpoints "[(384, 768), (768, 384), (768, 768), (1152, 384), (384, 1152)]" \ 39 | --mm_patch_merge_type spatial_unpad \ 40 | --bf16 True \ 41 | --run_name $MID_RUN_NAME \ 42 | --output_dir "/checkpoints/${MID_RUN_NAME}" \ 43 | --num_train_epochs 1 \ 44 | --per_device_train_batch_size 4 \ 45 | --per_device_eval_batch_size 4 \ 46 | --gradient_accumulation_steps 1 \ 47 | --evaluation_strategy "no" \ 48 | --save_strategy "steps" \ 49 | --save_steps 3000 \ 50 | --save_total_limit 1 \ 51 | --learning_rate 1e-5 \ 52 | --weight_decay 0. 
\ 53 | --warmup_ratio 0.03 \ 54 | --lr_scheduler_type "cosine" \ 55 | --logging_steps 1 \ 56 | --tf32 True \ 57 | --model_max_length 32768 \ 58 | --gradient_checkpointing True \ 59 | --dataloader_num_workers 16 \ 60 | --lazy_preprocess True \ 61 | --report_to wandb \ 62 | --torch_compile True \ 63 | --torch_compile_backend "inductor" \ 64 | --dataloader_drop_last True \ 65 | --attn_implementation sdpa 66 | 67 | # You can delete the sdpa attn_implementation if you want to use flash attn 68 | -------------------------------------------------------------------------------- /scripts/archived/train/dpo.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | # export NCCL_IB_HCA=${ARNOLD_RDMA_DEVICE} 5 | export NCCL_SOCKET_IFNAME=eth0 6 | export NCCL_DEBUG=INFO 7 | 8 | VISION_MODEL_VERSION="openai/clip-vit-large-patch14-336" 9 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 10 | 11 | ############### Pretrain ################ 12 | 13 | # Stage 2 14 | PROMPT_VERSION="qwen_1_5" 15 | 16 | #torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \ 17 | ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \ 18 | llava/train/train_dpo.py \ 19 | --deepspeed scripts/zero3.json \ 20 | --model_name_or_path lmms-lab/LongVA-7B \ 21 | --version $PROMPT_VERSION \ 22 | --dpo_alpha 1.0 --beta 0.1 --gamma 0 \ 23 | --data_path="/data/llava_video/shareVideoGPTV/dpo/sft_dpo_17k.jsonl" \ 24 | --image_folder /data/llava_data \ 25 | --video_folder /llava_video/shareVideoGPTV/frames/all_frames/ \ 26 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 27 | --vision_tower ${VISION_MODEL_VERSION} \ 28 | --mm_projector_type mlp2x_gelu \ 29 | --mm_vision_select_layer -2 \ 30 | --mm_use_im_start_end False \ 31 | --mm_use_im_patch_token False \ 32 | --mm_spatial_pool_stride 2 \ 33 | --mm_resampler_type "spatial_pool" \ 34 | --mm_spatial_pool_out_channels 1024 \ 35 | --group_by_modality_length True \ 36 | --image_aspect_ratio anyres \ 37 | --image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \ 38 | --mm_patch_merge_type unires \ 39 | --bf16 True \ 40 | --run_name $MID_RUN_NAME \ 41 | --output_dir "/checkpoints/${MID_RUN_NAME}" \ 42 | --num_train_epochs 3 \ 43 | --per_device_train_batch_size 1 \ 44 | --per_device_eval_batch_size 4 \ 45 | --gradient_accumulation_steps 16 \ 46 | --evaluation_strategy "no" \ 47 | --save_strategy "steps" \ 48 | --save_steps 3000 \ 49 | --save_total_limit 1 \ 50 | --learning_rate 5e-7 \ 51 | --weight_decay 0. 
\ 52 | --warmup_ratio 0.1 \ 53 | --lr_scheduler_type "linear" \ 54 | --logging_steps 1 \ 55 | --tf32 True \ 56 | --model_max_length 32768 \ 57 | --gradient_checkpointing True \ 58 | --dataloader_num_workers 16 \ 59 | --lazy_preprocess True \ 60 | --report_to wandb \ 61 | --torch_compile True \ 62 | --torch_compile_backend "inductor" \ 63 | --dataloader_drop_last True \ 64 | --attn_implementation sdpa -------------------------------------------------------------------------------- /scripts/mira_train/pretrain_clip.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | # export NCCL_SOCKET_IFNAME=eth0 5 | export NCCL_DEBUG=INFO 6 | 7 | LLM_VERSION="meta-llama/Meta-Llama-3-8B" 8 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 9 | VISION_MODEL_VERSION="openai/clip-vit-large-patch14-336" 10 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 11 | 12 | ############### Pretrain ################ 13 | 14 | PROMPT_VERSION=plain 15 | 16 | BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" 17 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 18 | 19 | # ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 20 | # deepspeed llava/train/train_mem.py \ 21 | # --deepspeed scripts/zero2.json \ 22 | python -m pdb llava/train/train_mem.py \ 23 | --model_name_or_path ${LLM_VERSION} \ 24 | --version ${PROMPT_VERSION} \ 25 | --data_path /home/jinhong.wang/workdir/dataset/llava_med_jsons/checked/alignment/llava_med_alignment_500k.json \ 26 | --image_folder /home/jinhong.wang/workdir/dataset/llava_med/images \ 27 | --vision_tower ${VISION_MODEL_VERSION} \ 28 | --mm_tunable_parts="mm_mlp_adapter" \ 29 | --mm_vision_select_layer -2 \ 30 | --mm_projector_type mlp2x_gelu \ 31 | --mm_use_im_start_end False \ 32 | --mm_use_im_patch_token False \ 33 | --rag_enabled True \ 34 | --rag_idx /home/jinhong.wang/workdir/database_rag/rag_indexv2.1_0203.idx \ 35 | --rag_mdpath /home/jinhong.wang/workdir/database_rag/rag_metadatav2.1_0203.csv \ 36 | --rag_tokenizer all-MiniLM-L6-v2 \ 37 | --rag_topk 5 \ 38 | --query_rewrite_enabled False \ 39 | --query_rewrite_host http://10.127.104.16:11434/api/chat \ 40 | --query_rewrite_model mistral:latest \ 41 | --bf16 True \ 42 | --output_dir /home/jinhong.wang/workdir/checkpoints/pt-projector/llavamed-lnext-rag-llama3-clip-8b \ 43 | --num_train_epochs 1 \ 44 | --per_device_train_batch_size 2 \ 45 | --per_device_eval_batch_size 4 \ 46 | --gradient_accumulation_steps 32 \ 47 | --evaluation_strategy "no" \ 48 | --save_strategy "no" \ 49 | --save_steps 50000 \ 50 | --learning_rate 1e-3 \ 51 | --weight_decay 0. 
\ 52 | --warmup_ratio 0.03 \ 53 | --lr_scheduler_type "cosine" \ 54 | --logging_steps 1 \ 55 | --tf32 True \ 56 | --model_max_length 8192 \ 57 | --gradient_checkpointing True \ 58 | --dataloader_num_workers 16 \ 59 | --lazy_preprocess True \ 60 | --report_to wandb \ 61 | --run_name $BASE_RUN_NAME \ 62 | --attn_implementation sdpa 63 | 64 | # You can delete the sdpa attn_implementation if you want to use flash attn -------------------------------------------------------------------------------- /scripts/archived/train/pt_clip2.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | # export NCCL_SOCKET_IFNAME=eno1 5 | export NCCL_DEBUG=INFO 6 | 7 | # LLM_VERSION="Qwen/Qwen2-7B-Instruct" 8 | LLM_VERSION="meta-llama/Meta-Llama-3-8B" 9 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 10 | VISION_MODEL_VERSION="openai/clip-vit-large-patch14-336" 11 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 12 | 13 | ############### Pretrain ################ 14 | 15 | PROMPT_VERSION=plain 16 | 17 | BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" 18 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 19 | 20 | # ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 21 | # --deepspeed scripts/zero3.json \ 22 | deepspeed llava/train/train_mem.py \ 23 | --deepspeed scripts/zero2.json \ 24 | --model_name_or_path ${LLM_VERSION} \ 25 | --version ${PROMPT_VERSION} \ 26 | --data_path /home/jinhong.wang/workdir/dataset/llava_med_jsons/checked/alignment/llava_med_alignment_500k.json \ 27 | --image_folder /home/jinhong.wang/workdir/dataset/llava_med/images \ 28 | --vision_tower ${VISION_MODEL_VERSION} \ 29 | --mm_tunable_parts="mm_mlp_adapter" \ 30 | --mm_vision_select_layer -2 \ 31 | --mm_projector_type mlp2x_gelu \ 32 | --mm_use_im_start_end False \ 33 | --mm_use_im_patch_token False \ 34 | --rag_enabled True \ 35 | --rag_idx /home/jinhong.wang/workdir/database_rag/rag_indexv2.1_0203.idx \ 36 | --rag_mdpath /home/jinhong.wang/workdir/database_rag/rag_metadatav2.1_0203.csv \ 37 | --rag_tokenizer all-MiniLM-L6-v2 \ 38 | --rag_topk 5 \ 39 | --query_rewrite_enabled False \ 40 | --query_rewrite_host http://10.127.104.16:11434/api/chat \ 41 | --query_rewrite_model mistral:latest \ 42 | --bf16 True \ 43 | --output_dir /home/jinhong.wang/workdir/checkpoints/pt-projector/llavamed-lnext-rag-llama3-8b \ 44 | --num_train_epochs 1 \ 45 | --per_device_train_batch_size 2 \ 46 | --per_device_eval_batch_size 4 \ 47 | --gradient_accumulation_steps 32 \ 48 | --evaluation_strategy "no" \ 49 | --save_strategy "no" \ 50 | --save_steps 50000 \ 51 | --learning_rate 1e-3 \ 52 | --weight_decay 0. 
\ 53 | --warmup_ratio 0.03 \ 54 | --lr_scheduler_type "cosine" \ 55 | --logging_steps 1 \ 56 | --tf32 True \ 57 | --model_max_length 8192 \ 58 | --gradient_checkpointing True \ 59 | --dataloader_num_workers 0 \ 60 | --lazy_preprocess True \ 61 | --report_to wandb \ 62 | --run_name $BASE_RUN_NAME \ 63 | --attn_implementation sdpa 64 | 65 | # You can delete the sdpa attn_implementation if you want to use flash attn -------------------------------------------------------------------------------- /scripts/mira_train/pretrain_biomedclip.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | # export NCCL_SOCKET_IFNAME=eno1 5 | export NCCL_DEBUG=INFO 6 | 7 | # LLM_VERSION="Qwen/Qwen2-7B-Instruct" 8 | LLM_VERSION="meta-llama/Meta-Llama-3-8B" 9 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 10 | VISION_MODEL_VERSION="biomedclip" 11 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 12 | 13 | ############### Pretrain ################ 14 | 15 | PROMPT_VERSION=plain 16 | 17 | BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" 18 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 19 | 20 | # ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 21 | # --deepspeed scripts/zero3.json \ 22 | 23 | # python llava/train/train_mem.py \ 24 | deepspeed llava/train/train_mem.py \ 25 | --deepspeed scripts/zero2.json \ 26 | --model_name_or_path ${LLM_VERSION} \ 27 | --version ${PROMPT_VERSION} \ 28 | --data_path /home/jinhong.wang/workdir/dataset/llava_med_jsons/checked/alignment/llava_med_alignment_500k.json \ 29 | --image_folder /home/jinhong.wang/workdir/dataset/llava_med/images \ 30 | --vision_tower ${VISION_MODEL_VERSION} \ 31 | --mm_tunable_parts="mm_mlp_adapter" \ 32 | --mm_vision_select_layer -2 \ 33 | --mm_projector_type mlp2x_gelu \ 34 | --mm_use_im_start_end False \ 35 | --mm_use_im_patch_token False \ 36 | --rag_enabled True \ 37 | --rag_idx /home/jinhong.wang/workdir/database_rag/rag_indexv2.1_0203.idx \ 38 | --rag_mdpath /home/jinhong.wang/workdir/database_rag/rag_metadatav2.1_0203.csv \ 39 | --rag_tokenizer all-MiniLM-L6-v2 \ 40 | --rag_topk 5 \ 41 | --query_rewrite_enabled False \ 42 | --query_rewrite_host http://10.127.104.16:11434/api/chat \ 43 | --query_rewrite_model mistral:latest \ 44 | --bf16 True \ 45 | --output_dir /home/jinhong.wang/workdir/checkpoints/pt-projector/llavamed-lnext-rag-biomedclip-llama3-8b \ 46 | --num_train_epochs 1 \ 47 | --per_device_train_batch_size 2 \ 48 | --per_device_eval_batch_size 4 \ 49 | --gradient_accumulation_steps 32 \ 50 | --evaluation_strategy "no" \ 51 | --save_strategy "no" \ 52 | --save_steps 50000 \ 53 | --learning_rate 1e-3 \ 54 | --weight_decay 0. 
\ 55 | --warmup_ratio 0.03 \ 56 | --lr_scheduler_type "cosine" \ 57 | --logging_steps 1 \ 58 | --tf32 True \ 59 | --model_max_length 8192 \ 60 | --gradient_checkpointing True \ 61 | --dataloader_num_workers 0 \ 62 | --lazy_preprocess True \ 63 | --report_to wandb \ 64 | --run_name $BASE_RUN_NAME \ 65 | --attn_implementation sdpa 66 | 67 | # You can delete the sdpa attn_implementation if you want to use flash attn -------------------------------------------------------------------------------- /scripts/mira_train/pretrain_siglip.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | # export NCCL_SOCKET_IFNAME=eno1 5 | export NCCL_DEBUG=INFO 6 | 7 | # LLM_VERSION="Qwen/Qwen2.5-7B-Instruct" 8 | LLM_VERSION="meta-llama/Meta-Llama-3-8B" 9 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 10 | VISION_MODEL_VERSION="google/siglip-so400m-patch14-384" 11 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 12 | 13 | ############### Pretrain ################ 14 | 15 | PROMPT_VERSION=plain 16 | 17 | BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" 18 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 19 | 20 | # ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 21 | # --deepspeed scripts/zero3.json \ 22 | # deepspeed llava/train/train_mem.py \ 23 | # --deepspeed scripts/zero3.json \ 24 | python -m pdb llava/train/train_mem.py \ 25 | --model_name_or_path ${LLM_VERSION} \ 26 | --version ${PROMPT_VERSION} \ 27 | --data_path /home/jinhong.wang/workdir/dataset/llava_med_jsons/checked/alignment/llava_med_alignment_500k.json \ 28 | --image_folder /home/jinhong.wang/workdir/dataset/llava_med/images \ 29 | --vision_tower ${VISION_MODEL_VERSION} \ 30 | --mm_tunable_parts="mm_mlp_adapter" \ 31 | --mm_vision_select_layer -2 \ 32 | --mm_projector_type mlp2x_gelu \ 33 | --mm_use_im_start_end False \ 34 | --mm_use_im_patch_token False \ 35 | --rag_enabled True \ 36 | --rag_idx /home/jinhong.wang/workdir/database_rag/rag_indexv2.1_0203.idx \ 37 | --rag_mdpath /home/jinhong.wang/workdir/database_rag/rag_metadatav2.1_0203.csv \ 38 | --rag_tokenizer all-MiniLM-L6-v2 \ 39 | --rag_topk 5 \ 40 | --query_rewrite_enabled False \ 41 | --query_rewrite_host http://10.127.104.16:11434/api/chat \ 42 | --query_rewrite_model mistral:latest \ 43 | --bf16 True \ 44 | --output_dir /home/jinhong.wang/workdir/checkpoints/pt-projector/llavamed-lnext-rag-qwen2.5ins-7b \ 45 | --num_train_epochs 1 \ 46 | --per_device_train_batch_size 2 \ 47 | --per_device_eval_batch_size 4 \ 48 | --gradient_accumulation_steps 32 \ 49 | --evaluation_strategy "no" \ 50 | --save_strategy "no" \ 51 | --save_steps 50000 \ 52 | --learning_rate 1e-3 \ 53 | --weight_decay 0. 
\ 54 | --warmup_ratio 0.03 \ 55 | --lr_scheduler_type "cosine" \ 56 | --logging_steps 1 \ 57 | --tf32 True \ 58 | --model_max_length 8192 \ 59 | --gradient_checkpointing True \ 60 | --dataloader_num_workers 0 \ 61 | --lazy_preprocess False \ 62 | --report_to wandb \ 63 | --run_name $BASE_RUN_NAME \ 64 | --attn_implementation sdpa 65 | 66 | # You can delete the sdpa attn_implementation if you want to use flash attn -------------------------------------------------------------------------------- /data-factory/data-merge/sample_images.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from PIL import Image 4 | import math 5 | 6 | def create_image_collage(input_folder, output_path, sample_size=100, aspect_ratio=(16, 9)): 7 | # Collect all image files in the folder 8 | image_files = [f for f in os.listdir(input_folder) 9 | if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))] 10 | 11 | # If there are fewer images than sample_size, use all of them 12 | sample_size = min(sample_size, len(image_files)) 13 | if sample_size == 0: 14 | print("No images found in the folder!") 15 | return 16 | 17 | # Randomly sample images 18 | selected_images = random.sample(image_files, sample_size) 19 | 20 | # Compute the grid layout 21 | total_images = len(selected_images) 22 | aspect_width, aspect_height = aspect_ratio 23 | # Estimate the approximate number of images per row and column 24 | cols = int(math.sqrt(total_images * aspect_width / aspect_height)) 25 | rows = math.ceil(total_images / cols) 26 | 27 | # Load all images and track the maximum size 28 | images = [] 29 | max_width = 0 30 | max_height = 0 31 | 32 | for img_file in selected_images: 33 | try: 34 | img_path = os.path.join(input_folder, img_file) 35 | img = Image.open(img_path).convert('RGB') 36 | images.append(img) 37 | max_width = max(max_width, img.width) 38 | max_height = max(max_height, img.height) 39 | except Exception as e: 40 | print(f"Failed to load image {img_file}: {e}") 41 | continue 42 | 43 | if not images: 44 | print("No images were loaded successfully!") 45 | return 46 | 47 | # Compute the output image size 48 | output_width = max_width * cols 49 | output_height = max_height * rows 50 | 51 | # Create a blank canvas 52 | collage = Image.new('RGB', (output_width, output_height), (255, 255, 255)) 53 | 54 | # Paste the images onto the canvas 55 | for idx, img in enumerate(images): 56 | # Compute the position of the current image 57 | row = idx // cols 58 | col = idx % cols 59 | 60 | # Resize the image to fit its grid cell 61 | img_resized = img.resize((max_width, max_height), Image.Resampling.LANCZOS) 62 | 63 | # Compute the paste position 64 | x = col * max_width 65 | y = row * max_height 66 | 67 | # Paste the image 68 | collage.paste(img_resized, (x, y)) 69 | 70 | # Resize the final collage to a 16:9 aspect ratio 71 | target_width = 1920 # adjust this value to change the output resolution 72 | target_height = int(target_width * aspect_height / aspect_width) 73 | final_collage = collage.resize((target_width, target_height), Image.Resampling.LANCZOS) 74 | 75 | # Save the result 76 | final_collage.save(output_path, quality=95) 77 | print(f"Collage saved to: {output_path}") 78 | 79 | # Usage example 80 | if __name__ == "__main__": 81 | # Set the input folder and output path 82 | input_folder = "/Users/moonshot/Documents/清洗MRAG/it_imfiles" # replace with your image folder path 83 | output_path = "collage_output.jpg" # output file name 84 | 85 | # Create the collage 86 | create_image_collage(input_folder, output_path, sample_size=104) -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_mixtral_1.6_336px_anyres_lmms_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set up wandb 4 | export WANDB_API_KEY=a651c244635bc6f913ab654af3f0eebaecdc9381 5 | export WANDB_ENTITY=llava-vl 6 | export WANDB_PROJECT=llava-next 7 | export PYTHONWARNINGS="ignore" 8 | 9 | cd
/mnt/bn/vl-research/workspace/boli01/projects/lmms-eval 10 | 11 | pip install -e . 12 | 13 | # set up llava dev env 14 | cd /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next 15 | 16 | ################## MISTRAL ################## 17 | PROMPT_VERSION=mistral_instruct 18 | MODEL_VERSION="Mistral-7B-Instruct-v0.2" 19 | ################## MISTRAL ################## 20 | 21 | ################## project ################## 22 | PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain" 23 | 24 | ################## data ################## 25 | DATA_NAME='llava_caps20k_chartqa19k' 26 | 27 | export WANDB_NAME=$PROJECT_NAME--$DATA_NAME--336px--anyres--sft 28 | export WANDB_MODE=online 29 | 30 | wandb online 31 | 32 | CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" deepspeed --master_port 26000 --include localhost:0,1,2,3,4,5,6,7 llava/train/train_mem.py \ 33 | --deepspeed ./scripts/zero3_offload.json \ 34 | --model_name_or_path mistralai/$MODEL_VERSION \ 35 | --version $PROMPT_VERSION \ 36 | --data_path ./playground/data/llava_instruct/$DATA_NAME.json \ 37 | --image_folder /mnt/bn/vl-research/data/llava \ 38 | --vision_tower openai/clip-vit-large-patch14-336 \ 39 | --mm_projector_type mlp2x_gelu \ 40 | --mm_vision_select_layer -2 \ 41 | --mm_use_im_start_end False \ 42 | --mm_use_im_patch_token False \ 43 | --group_by_modality_length True \ 44 | --unfreeze_mm_vision_tower True \ 45 | --mm_vision_tower_lr 2e-6 \ 46 | --image_aspect_ratio anyres \ 47 | --image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \ 48 | --mm_patch_merge_type spatial_unpad \ 49 | --bf16 True \ 50 | --output_dir ./checkpoints/$PROJECT_NAME--llava1.6--336px--anyres--sft \ 51 | --num_train_epochs 1 \ 52 | --per_device_train_batch_size 8 \ 53 | --per_device_eval_batch_size 4 \ 54 | --gradient_accumulation_steps 1 \ 55 | --evaluation_strategy "no" \ 56 | --save_strategy "steps" \ 57 | --save_steps 1500 \ 58 | --learning_rate 2e-5 \ 59 | --weight_decay 0. 
\ 60 | --warmup_ratio 0.03 \ 61 | --lr_scheduler_type "cosine" \ 62 | --logging_steps 1 \ 63 | --tf32 True \ 64 | --model_max_length 4096 \ 65 | --gradient_checkpointing True \ 66 | --dataloader_num_workers 32 \ 67 | --lazy_preprocess True \ 68 | --report_to wandb \ 69 | --run_name $WANDB_NAME 70 | # starting here is the args for evaluation 71 | --eval_num_processes 4 \ 72 | --task_names mme,docvqa_val \ 73 | --model_args pretrained=./checkpoints/$PROJECT_NAME--$DATA_NAME--336px--anyres--sft \ 74 | --limit 8 \ 75 | --batch_size 1 \ 76 | --log_samples \ 77 | --log_samples_suffix debug \ 78 | --output_path ./logs/ 79 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/dpo_data_info.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | json_path = "/mnt/bn/vl-research/workspace/boli01/projects/sft_data_workspace/vlfeedback_80k.jsonl" 5 | 6 | with open(json_path, "r") as f: 7 | data = f.readlines() 8 | 9 | data = [json.loads(d) for d in data] 10 | 11 | 12 | def convert_format(original_data, dimension="Visual Faithfulness"): 13 | converted_data = [] 14 | for item in original_data: 15 | # Assuming the best response is the one with the highest helpfulness rating 16 | best_completion = max(item["completions"], key=lambda x: int(x["annotations"]["Helpfulness"]["Rating"])) 17 | best_response = best_completion["response"] 18 | best_model = best_completion["model"] 19 | 20 | if "†source" in best_response: 21 | print(best_response) 22 | # Regex pattern to match the pattern 【digit†source】 23 | pattern = r"【\d+†source】" 24 | # Replace the matched patterns with an empty string 25 | cleaned_text = re.sub(pattern, "", best_response) 26 | best_response = cleaned_text 27 | print(f"*****************************************") 28 | print(best_response) 29 | 30 | # Assuming the worst response is the one with the lowest helpfulness rating 31 | worst_completion = min(item["completions"], key=lambda x: int(x["annotations"]["Helpfulness"]["Rating"])) 32 | worst_response = worst_completion["response"] 33 | 34 | if "†source" in worst_response: 35 | print(worst_response) 36 | # Regex pattern to match the pattern 【digit†source】 37 | pattern = r"【\d+†source】" 38 | # Replace the matched patterns with an empty string 39 | cleaned_text = re.sub(pattern, "", worst_response) 40 | worst_response = cleaned_text 41 | print(f"*****************************************") 42 | print(worst_response) 43 | 44 | # Extract scores 45 | best_score = int(best_completion["annotations"][dimension]["Rating"]) 46 | worst_score = int(worst_completion["annotations"][dimension]["Rating"]) 47 | 48 | # Construct the new format 49 | new_item = { 50 | "id": item["id"], 51 | "prompt": item["prompt"], 52 | "answer": "", 53 | "image": f"silkie_dpo/{item['id']}.jpg", # Assuming the video ID is the last part of the original ID 54 | "chosen": best_response, 55 | "rejected": worst_response, 56 | "chosen_score": best_score, 57 | "rejected_score": worst_score, 58 | } 59 | converted_data.append(new_item) 60 | 61 | return converted_data 62 | 63 | 64 | for dimension in ["Visual Faithfulness", "Helpfulness", "Ethical Considerations"]: 65 | converted_data = convert_format(data, dimension=dimension) 66 | with open(f"/mnt/bn/vl-research/data/llava_instruct/dpo_data/silkie_dpo_data_{dimension.replace(' ', '_').lower()}_{len(converted_data)}.json", "w") as f: 67 | json.dump(converted_data, f, indent=4) 68 |
-------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/eva_clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .eva_clip_processors import EvaClipImageTrainProcessor 5 | from .eva_vit import EVAEncoderWrapper 6 | from .factory import list_models, add_model_config, get_model_config 7 | 8 | from llava.utils import rank0_print 9 | 10 | 11 | class EvaClipVisionTower(nn.Module): 12 | def __init__(self, vision_tower, args, delay_load=False): 13 | super().__init__() 14 | 15 | self.is_loaded = False 16 | self.vision_tower_name = vision_tower 17 | self.vision_tower_pretrained = args.vision_tower_pretrained 18 | self.config = get_model_config(vision_tower) 19 | 20 | if not delay_load: 21 | rank0_print(f"Loading EVA ViT: {self.vision_tower_name}") 22 | self.load_model() 23 | elif getattr(args, "unfreeze_mm_vision_tower", False): 24 | # TODO: better detector is needed. 25 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.") 26 | self.load_model() 27 | elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts: 28 | rank0_print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.") 29 | self.load_model() 30 | else: 31 | self.cfg_only = self.config 32 | 33 | def load_model(self, device_map=None): 34 | rank0_print(f"Pretrained: {self.vision_tower_pretrained}") 35 | self.image_processor = EvaClipImageTrainProcessor(self.config["vision_cfg"]["image_size"]) 36 | self.vision_tower = EVAEncoderWrapper(self.vision_tower_pretrained, self.config) 37 | rank0_print(f"Loaded image processor: {self.image_processor}") 38 | self.vision_tower.requires_grad_(False) 39 | self.is_loaded = True 40 | 41 | def forward(self, images): 42 | if type(images) is list: 43 | image_features = [] 44 | for image in images: 45 | image_feature = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)).to(image.dtype) 46 | image_features.append(image_feature) 47 | else: 48 | image_features = self.vision_tower(images.to(device=self.device, dtype=self.dtype)).to(images.dtype) 49 | 50 | return image_features 51 | 52 | @property 53 | def dtype(self): 54 | return self.vision_tower.dtype 55 | 56 | @property 57 | def device(self): 58 | return self.vision_tower.device 59 | 60 | @property 61 | def hidden_size(self): 62 | return self.config["vision_cfg"]["width"] 63 | 64 | @property 65 | def num_patches(self): 66 | return (self.config["vision_cfg"]["image_size"] // self.config["vision_cfg"]["patch_size"]) ** 2 67 | 68 | @property 69 | def num_patches_per_side(self): 70 | return self.config["vision_cfg"]["image_size"] // self.config["vision_cfg"]["patch_size"] 71 | 72 | @property 73 | def image_size(self): 74 | return self.config["vision_cfg"]["image_size"] 75 | -------------------------------------------------------------------------------- /scripts/archived/summarize_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from tqdm import tqdm 4 | 5 | with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_v1_5_mix665k.json") as f: 6 | llava_v1_5_mix665k = json.load(f) # 665298 7 | 8 | with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_instruct_150k.json") as f: 9 | llava_instruct_150k = 
json.load(f) # 157712 10 | 11 | # Create sets of "id" fields 12 | mix665k_ids = set() 13 | for item in llava_v1_5_mix665k: 14 | all_conv = "" 15 | for cur_conversation in item["conversations"]: 16 | all_conv += cur_conversation["value"] 17 | mix665k_ids.add(f'{item["id"]}_{all_conv}') 18 | 19 | instruct_150k_ids = set() 20 | for item in llava_instruct_150k: 21 | all_conv = "" 22 | for cur_conversation in item["conversations"]: 23 | all_conv += cur_conversation["value"] 24 | instruct_150k_ids.add(f'{item["id"]}_{all_conv}') 25 | 26 | share_gpt_ids = set() 27 | for item in llava_v1_5_mix665k: 28 | if "image" not in item: 29 | all_conv = "" 30 | for cur_conversation in item["conversations"]: 31 | all_conv += cur_conversation["value"] 32 | share_gpt_ids.add(f'{item["id"]}_{all_conv}') # 40688 33 | 34 | # Get "id" fields that are in mix665k but not in instruct_150k and share_gpt 35 | new_ids = mix665k_ids - instruct_150k_ids - share_gpt_ids # 466898 36 | 37 | # Get "id" fields that are in mix665k but not in share_gpt 38 | # new_ids = mix665k_ids - share_gpt_ids #624610 39 | 40 | # import pdb; pdb.set_trace() 41 | 42 | # Filter mix665k data based on new_ids 43 | new_data = [] 44 | for item in llava_v1_5_mix665k: 45 | all_conv = "" 46 | for cur_conversation in item["conversations"]: 47 | all_conv += cur_conversation["value"] 48 | if f'{item["id"]}_{all_conv}' in new_ids: 49 | new_data.append(item) 50 | 51 | import pdb 52 | 53 | pdb.set_trace() 54 | 55 | with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/mixtral_instruct_135K_of_158K_V1.5.json") as f: 56 | new_mixtral_instruct = json.load(f) 57 | 58 | # mixtral_instruct_50K_of_80K_V1.json@ 59 | 60 | # print(len(new_data)) 61 | # for _ in new_mixtral_instruct: 62 | # # import pdb; pdb.set_trace() 63 | # if "coco" not in _["image"]: 64 | # _["image"] = f"coco/train2017/{_['image']}" 65 | # new_data.append(_) 66 | 67 | # print(len(instruct_150k_ids)) 68 | print(len(new_data)) 69 | 70 | # for _ in tqdm(new_data): 71 | # if "image" in _: 72 | # if "000000442654" in _["image"]: 73 | # all_conv = "" 74 | # for cur_conversation in _["conversations"]: 75 | # all_conv += cur_conversation["value"] 76 | # # if not os.path.exists(f'/mnt/bn/vl-research/workspace/boli01/data/playground/data/{_["image"]}'): 77 | # import pdb; pdb.set_trace() 78 | 79 | # Write new_data to a new JSON file 80 | with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_v1_5_mix665k_minus_llava_instruct_150k_minus_sharegpt_plus_mixtral_instruct_135K_of_158K_V1.5.json", "w") as f: 81 | json.dump(new_data, f) 82 | -------------------------------------------------------------------------------- /scripts/archived/video/eval/activitynet_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ROOT_DIR="root to LLaVA-NeXT-Video" 3 | 4 | if [ ! -e $ROOT_DIR ]; then 5 | echo "The root dir does not exist. Exiting the script." 
6 | exit 1 7 | fi 8 | 9 | cd $ROOT_DIR 10 | 11 | export PYTHONWARNINGS=ignore 12 | export TOKENIZERS_PARALLELISM=false 13 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 14 | gpu_list="${CUDA_VISIBLE_DEVICES}" 15 | GPULIST=(${(s:,:)gpu_list}) 16 | 17 | CHUNKS=${#GPULIST[@]} 18 | echo "Using $CHUNKS GPUs" 19 | 20 | CKPT=$1 21 | CONV_MODE=$2 22 | FRAMES=$3 23 | OVERWRITE=$4 24 | PREDEFINED_CONFIGURE=$5 25 | mm_spatial_pool_stride=$6 26 | MODEL_MAX_LENGTH=${7:-0} 27 | 28 | CKPT=$1 29 | CONV_MODE=$2 30 | FRAMES=$3 31 | POOL_STRIDE=$4 32 | OVERWRITE=$5 33 | CHUNKS=${6:-1} 34 | 35 | PATCHIFY=False 36 | 37 | 38 | OPENAIKEY="INPUT YOUR OPENAI API" 39 | 40 | 41 | if [ "$OVERWRITE" = False ]; then 42 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}_overwrite_${OVERWRITE} 43 | 44 | else 45 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE} 46 | fi 47 | 48 | echo $SAVE_DIR 49 | 50 | # for IDX in {1..$CHUNKS}; do 51 | # GPU_ID=${GPULIST[$IDX]} # Note: Zsh arrays are 1-indexed by default 52 | 53 | # # GPU_FREE=0 54 | # # while [ $GPU_FREE -eq 0 ]; do 55 | # # # Using nvidia-smi to get the memory usage of the GPU with ID $GPU_ID 56 | # # # Parsing the output to extract the memory usage, and checking if it is "0" 57 | # # MEM_USAGE=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $GPU_ID | tr -d '[:space:]') 58 | 59 | # # if [ "$MEM_USAGE" -eq 0 ]; then 60 | # # GPU_FREE=1 61 | # # echo "GPU $GPU_ID is free." 62 | # # else 63 | # # echo "GPU $GPU_ID is in use. Memory used: ${MEM_USAGE}MiB. Checking again in 100 seconds..." 64 | # # sleep 100 65 | # # fi 66 | # # done 67 | 68 | # echo "Running on GPU $GPU_ID" 69 | # CUDA_VISIBLE_DEVICES=$GPU_ID python3 llavavid/eval/model_activitynet_qa.py \ 70 | # --model-path $CKPT \ 71 | # --video_dir ./data/llava_video/ActivityNet-QA/all_test \ 72 | # --gt_file_question ./data/llava_video/ActivityNet-QA/test_q.json \ 73 | # --gt_file_answers ./data/llava_videoActivityNet-QA/test_a.json \ 74 | # --output_dir ./work_dirs/eval_activitynet/$SAVE_DIR \ 75 | # --output_name pred \ 76 | # --num-chunks $CHUNKS \ 77 | # --chunk-idx $(($IDX - 1)) \ 78 | # --overwrite ${OVERWRITE} \ 79 | # --patchify_video_feature ${PATCHIFY} \ 80 | # --predefined_configure ${PREDEFINED_CONFIGURE} \ 81 | # --mm_spatial_pool_stride ${mm_spatial_pool_stride:-4} \ 82 | # --for_get_frames_num $FRAMES \ 83 | # --model-max-length ${MODEL_MAX_LENGTH:-0} \ 84 | # --conv-mode $CONV_MODE & 85 | 86 | # done 87 | 88 | # wait 89 | 90 | python3 llava/eval/eval_activitynet_qa.py \ 91 | --pred_path ./work_dirs/eval_activitynet/$SAVE_DIR \ 92 | --output_dir ./work_dirs/eval_activitynet/$SAVE_DIR/results \ 93 | --output_json ./work_dirs/eval_activitynet/$SAVE_DIR/results.json \ 94 | --num_chunks $CHUNKS \ 95 | --api_key $OPENAIKEY \ 96 | # --num_tasks 16 \ -------------------------------------------------------------------------------- /scripts/archived/video/train/SO400M_Qwen2_72B_ov_to_video_am9.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set up the data folder 4 | IMAGE_FOLDER="XXX" 5 | VIDEO_FOLDER="XXX" 6 | DATA_YAML="XXX" # e.g exp.yaml 7 | 8 | ############### Prepare Envs ################# 9 | python3 -m pip install flash-attn --no-build-isolation 10 | alias python=python3 11 | ############### Show Envs #################### 12 | 13 | nvidia-smi 14 | 15 | ################ Arnold Jobs ################ 16 | 17 | LLM_VERSION="Qwen/Qwen2-72B-Instruct" 18 | 
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 19 | VISION_MODEL_VERSION="google/siglip-so400m-patch14-384" 20 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 21 | 22 | 23 | BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-72B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" 24 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 25 | 26 | # Stage 2 27 | PROMPT_VERSION="qwen_1_5" 28 | MID_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_to_video_am9" 29 | PREV_STAGE_CHECKPOINT="lmms-lab/llava-onevision-qwen2-72b-ov-si" 30 | echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}" 31 | echo "MID_RUN_NAME: ${MID_RUN_NAME}" 32 | 33 | 34 | # ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \ 35 | deepspeed --master_port 30000 \ 36 | llava/train/train_mem.py \ 37 | --deepspeed scripts/zero3.json \ 38 | --model_name_or_path $PREV_STAGE_CHECKPOINT \ 39 | --version $PROMPT_VERSION \ 40 | --data_path $DATA_YAML \ 41 | --image_folder $IMAGE_FOLDER \ 42 | --video_folder $VIDEO_FOLDER \ 43 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 44 | --mm_vision_tower_lr=2e-6 \ 45 | --vision_tower ${VISION_MODEL_VERSION} \ 46 | --mm_projector_type mlp2x_gelu \ 47 | --mm_vision_select_layer -2 \ 48 | --mm_use_im_start_end False \ 49 | --mm_use_im_patch_token False \ 50 | --group_by_modality_length True \ 51 | --image_aspect_ratio anyres_max_9 \ 52 | --image_grid_pinpoints "(1x1),...,(6x6)" \ 53 | --mm_patch_merge_type spatial_unpad \ 54 | --bf16 True \ 55 | --run_name $MID_RUN_NAME \ 56 | --output_dir ./work_dirs/$MID_RUN_NAME \ 57 | --num_train_epochs 1 \ 58 | --per_device_train_batch_size 1 \ 59 | --per_device_eval_batch_size 4 \ 60 | --gradient_accumulation_steps 2 \ 61 | --evaluation_strategy "no" \ 62 | --save_strategy "steps" \ 63 | --save_steps 500 \ 64 | --save_total_limit 1 \ 65 | --learning_rate 1e-5 \ 66 | --weight_decay 0. 
\ 67 | --warmup_ratio 0.03 \ 68 | --lr_scheduler_type "cosine" \ 69 | --logging_steps 1 \ 70 | --tf32 True \ 71 | --model_max_length 32768 \ 72 | --gradient_checkpointing True \ 73 | --dataloader_num_workers 2 \ 74 | --lazy_preprocess True \ 75 | --report_to wandb \ 76 | --torch_compile True \ 77 | --torch_compile_backend "inductor" \ 78 | --dataloader_drop_last True \ 79 | --frames_upbound 32 \ 80 | --mm_newline_position grid \ 81 | --add_time_instruction True \ 82 | --force_sample True \ 83 | --mm_spatial_pool_stride 2 84 | exit 0; -------------------------------------------------------------------------------- /scripts/archived/video/train/SO400M_Qwen2_7B_ov_to_video_am9.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set up the data folder 4 | IMAGE_FOLDER="XXX" 5 | VIDEO_FOLDER="XXX" 6 | DATA_YAML="XXX" # e.g exp.yaml 7 | 8 | ############### Prepare Envs ################# 9 | python3 -m pip install flash-attn --no-build-isolation 10 | alias python=python3 11 | ############### Show Envs #################### 12 | 13 | nvidia-smi 14 | 15 | ################ Arnold Jobs ################ 16 | 17 | LLM_VERSION="Qwen/Qwen2-7B-Instruct" 18 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 19 | VISION_MODEL_VERSION="google/siglip-so400m-patch14-384" 20 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 21 | # 22 | 23 | BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" 24 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 25 | 26 | # Stage 2 27 | PROMPT_VERSION="qwen_1_5" 28 | MID_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_to_video_am9" 29 | PREV_STAGE_CHECKPOINT="lmms-lab/llava-onevision-qwen2-7b-ov-si" 30 | echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}" 31 | echo "MID_RUN_NAME: ${MID_RUN_NAME}" 32 | 33 | 34 | # ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \ 35 | deepspeed --master_port 30000 \ 36 | llava/train/train_mem.py \ 37 | --deepspeed scripts/zero3.json \ 38 | --model_name_or_path $PREV_STAGE_CHECKPOINT \ 39 | --version $PROMPT_VERSION \ 40 | --data_path $DATA_YAML \ 41 | --image_folder $IMAGE_FOLDER \ 42 | --video_folder $VIDEO_FOLDER \ 43 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 44 | --mm_vision_tower_lr=2e-6 \ 45 | --vision_tower ${VISION_MODEL_VERSION} \ 46 | --mm_projector_type mlp2x_gelu \ 47 | --mm_vision_select_layer -2 \ 48 | --mm_use_im_start_end False \ 49 | --mm_use_im_patch_token False \ 50 | --group_by_modality_length True \ 51 | --image_aspect_ratio anyres_max_9 \ 52 | --image_grid_pinpoints "(1x1),...,(6x6)" \ 53 | --mm_patch_merge_type spatial_unpad \ 54 | --bf16 True \ 55 | --run_name $MID_RUN_NAME \ 56 | --output_dir ./work_dirs/$MID_RUN_NAME \ 57 | --num_train_epochs 1 \ 58 | --per_device_train_batch_size 1 \ 59 | --per_device_eval_batch_size 4 \ 60 | --gradient_accumulation_steps 2 \ 61 | --evaluation_strategy "no" \ 62 | --save_strategy "steps" \ 63 | --save_steps 500 \ 64 | --save_total_limit 1 \ 65 | --learning_rate 1e-5 \ 66 | --weight_decay 0. 
\ 67 | --warmup_ratio 0.03 \ 68 | --lr_scheduler_type "cosine" \ 69 | --logging_steps 1 \ 70 | --tf32 True \ 71 | --model_max_length 32768 \ 72 | --gradient_checkpointing True \ 73 | --dataloader_num_workers 2 \ 74 | --lazy_preprocess True \ 75 | --report_to wandb \ 76 | --torch_compile True \ 77 | --torch_compile_backend "inductor" \ 78 | --dataloader_drop_last True \ 79 | --frames_upbound 64 \ 80 | --mm_newline_position grid \ 81 | --add_time_instruction True \ 82 | --force_sample True \ 83 | --mm_spatial_pool_stride 2 84 | exit 0; -------------------------------------------------------------------------------- /scripts/archived/train/finetune_si.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | export NCCL_SOCKET_IFNAME=eth0 5 | export NCCL_DEBUG=INFO 6 | 7 | LLM_VERSION="Qwen/Qwen2-7B-Instruct" 8 | # for 7b model we recommend bs=1, accum=2, 16 nodes, 128 gpus, lr=1e-5, warmup=0.03 9 | # for 72b model we recommend bs=1, accum=1, 32 nodes, 256 gpus, lr=1e-5, warmup=0.03 10 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 11 | VISION_MODEL_VERSION="google/siglip-so400m-patch14-384" 12 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 13 | 14 | ############### Pretrain ################ 15 | 16 | BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" 17 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 18 | 19 | ############### Finetune ################ 20 | 21 | # Stage 2 22 | PROMPT_VERSION="qwen_1_5" 23 | RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-si_stage_am9" 24 | PREV_STAGE_CHECKPOINT="/mnt/bn/vl-research/checkpoints/onevision/xxxxxxxxxxxxxxxx" # replace it with your last checkpoint training from mid stage 25 | echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}" 26 | echo "MID_RUN_NAME: ${RUN_NAME}" 27 | 28 | ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 29 | llava/train/train_mem.py \ 30 | --deepspeed scripts/zero3.json \ 31 | --model_name_or_path $PREV_STAGE_CHECKPOINT \ 32 | --version $PROMPT_VERSION \ 33 | --data_path /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next/scripts/i18n/scale_llms/next_3p2m_single_image.yaml \ 34 | --image_folder /mnt/bn/vl-research/data/llava_data \ 35 | --video_folder /mnt/bn/vl-research/data/llava_video \ 36 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 37 | --mm_vision_tower_lr=2e-6 \ 38 | --vision_tower ${VISION_MODEL_VERSION} \ 39 | --mm_projector_type mlp2x_gelu \ 40 | --mm_vision_select_layer -2 \ 41 | --mm_use_im_start_end False \ 42 | --mm_use_im_patch_token False \ 43 | --group_by_modality_length True \ 44 | --image_aspect_ratio anyres_max_9 \ 45 | --image_grid_pinpoints "(1x1),...,(6x6)" \ 46 | --mm_patch_merge_type spatial_unpad \ 47 | --bf16 True \ 48 | --run_name $RUN_NAME \ 49 | --output_dir /mnt/bn/vl-research/checkpoints/onevision/$RUN_NAME \ 50 | --num_train_epochs 1 \ 51 | --per_device_train_batch_size 1 \ 52 | --per_device_eval_batch_size 4 \ 53 | --gradient_accumulation_steps 2 \ 54 | --evaluation_strategy "no" \ 55 | --save_strategy "steps" \ 56 | --save_steps 1000 \ 57 | --save_total_limit 1 \ 58 | --learning_rate 1e-5 \ 59 | --weight_decay 0. 
\ 60 | --warmup_ratio 0.03 \ 61 | --lr_scheduler_type "cosine" \ 62 | --logging_steps 1 \ 63 | --tf32 True \ 64 | --model_max_length 32768 \ 65 | --gradient_checkpointing True \ 66 | --dataloader_num_workers 4 \ 67 | --lazy_preprocess True \ 68 | --report_to wandb \ 69 | --torch_compile True \ 70 | --torch_compile_backend "inductor" \ 71 | --dataloader_drop_last True \ 72 | --frames_upbound 32 73 | exit 0; 74 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/masked_drop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import random 5 | 6 | 7 | class MaskedDrop(nn.Module): 8 | def __init__(self, model_args): 9 | super().__init__() 10 | 11 | self.mode = model_args.mm_mask_drop_mode 12 | self.skip_percentage = model_args.mm_mask_drop_skip_percentage 13 | self.ratio = model_args.mm_mask_drop_ratio 14 | self.ratio_upper = model_args.mm_mask_drop_ratio_upper 15 | self.ratio_lower = model_args.mm_mask_drop_ratio_lower 16 | 17 | def forward(self, image_features, *args, **kwargs): 18 | 19 | if not self.training: 20 | return image_features 21 | 22 | if self.skip_percentage > random.random(): 23 | return image_features 24 | 25 | masked_features = [] 26 | 27 | for image_feature in image_features: 28 | num_tokens = image_feature.shape[0] 29 | if self.mode == "fixed": 30 | num_keep = int(num_tokens * self.ratio) 31 | masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0][0]) 32 | elif self.mode == "range": 33 | num_keep = int(num_tokens * random.uniform(self.ratio_lower, self.ratio_upper)) 34 | masked_features.append(self.random_masking(image_feature.unsqueeze(0), num_keep)[0]) 35 | elif self.mode == "cls_only": 36 | masked_features.append(image_feature[0:1]) 37 | else: 38 | raise ValueError(f"Unexpected masked drop mode: {self.mode}") 39 | 40 | if self.mode not in ["range"] and (type(image_features) is not list or self.mode in ["cls_only"]): 41 | masked_features = torch.stack(masked_features, dim=0) 42 | 43 | return masked_features 44 | 45 | @property 46 | def config(self): 47 | return { 48 | "mm_resampler_type": "masked_drop", 49 | "mm_mask_drop_mode": self.mode, 50 | "mm_mask_drop_skip_percentage": self.skip_percentage, 51 | "mm_mask_drop_ratio": self.ratio, 52 | "mm_mask_drop_ratio_upper": self.ratio_upper, 53 | "mm_mask_drop_ratio_lower": self.ratio_lower, 54 | } 55 | 56 | def random_masking(self, x, len_keep): 57 | """ 58 | Perform per-sample random masking by per-sample shuffling. 59 | Per-sample shuffling is done by argsort random noise. 
60 | x: [N, L, D], sequence 61 | """ 62 | N, L, D = x.shape # batch, length, dim 63 | 64 | noise = torch.rand(N, L, device=x.device) # noise in [0, 1] 65 | 66 | # sort noise for each sample 67 | ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove 68 | ids_restore = torch.argsort(ids_shuffle, dim=1) 69 | 70 | # keep the first subset 71 | ids_keep = ids_shuffle[:, :len_keep] 72 | x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D)) 73 | 74 | # generate the binary mask: 0 is keep, 1 is remove 75 | mask = torch.ones([N, L], device=x.device) 76 | mask[:, :len_keep] = 0 77 | # unshuffle to get the binary mask 78 | mask = torch.gather(mask, dim=1, index=ids_restore) 79 | 80 | return x_masked, mask, ids_restore 81 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_mixtral_1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | dataset_name=$1 3 | 4 | cd /mnt/bn/vl-research/workspace/yhzhang/LLaVA 5 | 6 | # Install yolk3k if not installed 7 | if ! pip show yolk3k > /dev/null 2>&1; then 8 | pip install yolk3k 9 | fi 10 | 11 | # Get the installed version of transformers 12 | installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2) 13 | 14 | # Get the latest version of transformers from PyPI 15 | latest_version=$(yolk -V transformers | cut -d ' ' -f 2) 16 | 17 | # Check if the installed version is not the latest 18 | if [ "$installed_version" != "$latest_version" ]; then 19 | pip install -U transformers 20 | fi 21 | 22 | # Get the installed version of deepspeed 23 | installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2) 24 | 25 | # Get the latest version of deepspeed from PyPI 26 | # latest_version=$(yolk -V deepspeed | cut -d ' ' -f 2) 27 | 28 | # Check if the installed version is not the latest 29 | if [ "$installed_version" != "0.12.2" ]; then 30 | pip install deepspeed==0.12.2 31 | fi 32 | 33 | # Install yolk3k if not installed 34 | if ! 
pip show flash-attn > /dev/null 2>&1; then 35 | pip install flash-attn --no-build-isolation 36 | fi 37 | 38 | ################## MISTRAL ################## 39 | PROMPT_VERSION=mistral_instruct 40 | MODEL_VERSION="Mistral-7B-Instruct-v0.2" 41 | ################## MISTRAL ################## 42 | 43 | 44 | ################## project ################## 45 | PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-mlp2x_gelu-pretrain_blip558k_plain" 46 | 47 | ################## data ################## 48 | DATA_NAME=$dataset_name 49 | 50 | 51 | # wandb configure 52 | export WANDB_API_KEY="03fc62d68025c9498cf6493432551badd7d4f953" 53 | wandb login $WANDB_API_KEY 54 | 55 | export WANDB_NAME=$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME 56 | 57 | export WANDB_PROJECT=LLaVA_Mixtral 58 | 59 | export WANDB_MODE=online 60 | 61 | wandb online 62 | 63 | deepspeed --master_port 26000 \ 64 | llava/train/train_mem.py \ 65 | --deepspeed ./scripts/zero2.json \ 66 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 67 | --version $PROMPT_VERSION \ 68 | --data_path ./playground/data/$DATA_NAME.json \ 69 | --image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data \ 70 | --vision_tower openai/clip-vit-large-patch14 \ 71 | --pretrain_mm_mlp_adapter ./checkpoints/$PROJECT_NAME/mm_projector.bin \ 72 | --mm_vision_select_layer -2 \ 73 | --mm_projector_type mlp2x_gelu \ 74 | --mm_use_im_start_end False \ 75 | --mm_use_im_patch_token False \ 76 | --bf16 True \ 77 | --output_dir ./checkpoints/llava--$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME--finetune \ 78 | --num_train_epochs 1 \ 79 | --per_device_train_batch_size 16 \ 80 | --per_device_eval_batch_size 4 \ 81 | --gradient_accumulation_steps 1 \ 82 | --evaluation_strategy "no" \ 83 | --save_strategy "steps" \ 84 | --save_steps 50000 \ 85 | --save_total_limit 1 \ 86 | --learning_rate 2e-5 \ 87 | --weight_decay 0. \ 88 | --warmup_ratio 0.03 \ 89 | --lr_scheduler_type "cosine" \ 90 | --logging_steps 1 \ 91 | --tf32 True \ 92 | --model_max_length 2048 \ 93 | --gradient_checkpointing True \ 94 | --dataloader_num_workers 16 \ 95 | --lazy_preprocess True 96 | # --report_to wandb 97 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA 4 | 5 | # Install yolk3k if not installed 6 | if ! pip show yolk3k > /dev/null 2>&1; then 7 | pip install yolk3k 8 | fi 9 | 10 | # Get the installed version of transformers 11 | installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2) 12 | 13 | # Get the latest version of transformers from PyPI 14 | latest_version=$(yolk -V transformers | cut -d ' ' -f 2) 15 | 16 | # Check if the installed version is not the latest 17 | if [ "$installed_version" != "$latest_version" ]; then 18 | pip install -U transformers 19 | fi 20 | 21 | # Get the installed version of deepspeed 22 | installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2) 23 | 24 | # Get the latest version of deepspeed from PyPI 25 | latest_version=$(yolk -V deepspeed | cut -d ' ' -f 2) 26 | 27 | # Check if the installed version is not the latest 28 | # pip install deepspeed==0.12.2 29 | if [ "$installed_version" != "$latest_version" ]; then 30 | pip install deepspeed==0.12.2 31 | fi 32 | 33 | # Install flash-attn if not installed 34 | if ! 
pip show flash-attn > /dev/null 2>&1; then 35 | pip install flash-attn --no-build-isolation 36 | fi 37 | 38 | ################## VICUNA ################## 39 | PROMPT_VERSION=v1 40 | MODEL_VERSION="vicuna-7b-v1-5" 41 | ################## VICUNA ################## 42 | 43 | 44 | ################## project ################## 45 | PROJECT_NAME="ds_llava-vicuna-7b-v1-5-mlp2x_gelu-pretrain_blip558k_plain" 46 | 47 | ################## data ################## 48 | DATA_NAME="mixtral_instruct_158K_V1" 49 | 50 | # wandb configure 51 | export WANDB_API_KEY="03fc62d68025c9498cf6493432551badd7d4f953" 52 | wandb login $WANDB_API_KEY 53 | 54 | export WANDB_NAME=$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME 55 | 56 | export WANDB_PROJECT=LLaVA_Mixtral 57 | 58 | export WANDB_MODE=online 59 | 60 | # wandb online 61 | 62 | deepspeed --master_port 26000 \ 63 | llava/train/train_mem.py \ 64 | --deepspeed ./scripts/zero2.json \ 65 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 66 | --version $PROMPT_VERSION \ 67 | --data_path ./playground/data/$DATA_NAME.json \ 68 | --image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data/coco/train2017 \ 69 | --vision_tower openai/clip-vit-large-patch14 \ 70 | --pretrain_mm_mlp_adapter ./checkpoints/$PROJECT_NAME/mm_projector.bin \ 71 | --mm_vision_select_layer -2 \ 72 | --mm_projector_type mlp2x_gelu \ 73 | --mm_use_im_start_end False \ 74 | --mm_use_im_patch_token False \ 75 | --bf16 True \ 76 | --output_dir ./checkpoints/llava--$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME--finetune \ 77 | --num_train_epochs 1 \ 78 | --per_device_train_batch_size 16 \ 79 | --per_device_eval_batch_size 4 \ 80 | --gradient_accumulation_steps 1 \ 81 | --evaluation_strategy "no" \ 82 | --save_strategy "steps" \ 83 | --save_steps 50000 \ 84 | --save_total_limit 1 \ 85 | --learning_rate 2e-5 \ 86 | --weight_decay 0. \ 87 | --warmup_ratio 0.03 \ 88 | --lr_scheduler_type "cosine" \ 89 | --logging_steps 1 \ 90 | --tf32 True \ 91 | --model_max_length 2048 \ 92 | --gradient_checkpointing True \ 93 | --dataloader_num_workers 16 \ 94 | --lazy_preprocess True \ 95 | --report_to wandb 96 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_mixtral.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA 4 | 5 | # Install yolk3k if not installed 6 | if ! pip show yolk3k > /dev/null 2>&1; then 7 | pip install yolk3k 8 | fi 9 | 10 | # Get the installed version of transformers 11 | installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2) 12 | 13 | # Get the latest version of transformers from PyPI 14 | latest_version=$(yolk -V transformers | cut -d ' ' -f 2) 15 | 16 | # Check if the installed version is not the latest 17 | if [ "$installed_version" != "$latest_version" ]; then 18 | pip install -U transformers 19 | fi 20 | 21 | # Get the installed version of deepspeed 22 | installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2) 23 | 24 | # Get the latest version of deepspeed from PyPI 25 | latest_version=$(yolk -V deepspeed | cut -d ' ' -f 2) 26 | 27 | # Check if the installed version is not the latest 28 | if [ "$installed_version" != "$latest_version" ]; then 29 | pip install deepspeed==0.12.2 30 | fi 31 | 32 | # Install yolk3k if not installed 33 | if ! 
pip show flash-attn > /dev/null 2>&1; then 34 | pip install flash-attn --no-build-isolation 35 | fi 36 | 37 | 38 | ################## MISTRAL ################## 39 | PROMPT_VERSION=mistral_instruct 40 | MODEL_VERSION="Mistral-7B-Instruct-v0.2" 41 | ################## VICUNA ################## 42 | 43 | 44 | ################## project ################## 45 | PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-mlp2x_gelu-pretrain_blip558k_plain" 46 | 47 | ################## data ################## 48 | DATA_NAME="mixtral_instruct_158K_V1" 49 | 50 | # wandb configure 51 | export WANDB_API_KEY="03fc62d68025c9498cf6493432551badd7d4f953" 52 | wandb login $WANDB_API_KEY 53 | 54 | export WANDB_NAME=$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME 55 | 56 | export WANDB_PROJECT=LLaVA_Mixtral 57 | 58 | export WANDB_MODE=online 59 | 60 | wandb online 61 | 62 | 63 | deepspeed --master_port 26000 \ 64 | llava/train/train_mem.py \ 65 | --deepspeed ./scripts/zero2.json \ 66 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 67 | --version $PROMPT_VERSION \ 68 | --data_path ./playground/data/$DATA_NAME.json \ 69 | --image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data/coco/train2017 \ 70 | --vision_tower openai/clip-vit-large-patch14 \ 71 | --pretrain_mm_mlp_adapter ./checkpoints/$PROJECT_NAME/mm_projector.bin \ 72 | --mm_vision_select_layer -2 \ 73 | --mm_projector_type mlp2x_gelu \ 74 | --mm_use_im_start_end False \ 75 | --mm_use_im_patch_token False \ 76 | --bf16 True \ 77 | --output_dir ./checkpoints/llava--$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME--finetune \ 78 | --num_train_epochs 1 \ 79 | --per_device_train_batch_size 16 \ 80 | --per_device_eval_batch_size 4 \ 81 | --gradient_accumulation_steps 1 \ 82 | --evaluation_strategy "no" \ 83 | --save_strategy "steps" \ 84 | --save_steps 50000 \ 85 | --save_total_limit 1 \ 86 | --learning_rate 2e-5 \ 87 | --weight_decay 0. \ 88 | --warmup_ratio 0.03 \ 89 | --lr_scheduler_type "cosine" \ 90 | --logging_steps 1 \ 91 | --tf32 True \ 92 | --model_max_length 2048 \ 93 | --gradient_checkpointing True \ 94 | --dataloader_num_workers 16 \ 95 | --lazy_preprocess True \ 96 | --report_to wandb 97 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_mixtral_copy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA 4 | 5 | # Install yolk3k if not installed 6 | if ! pip show yolk3k > /dev/null 2>&1; then 7 | pip install yolk3k 8 | fi 9 | 10 | # Get the installed version of transformers 11 | installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2) 12 | 13 | # Get the latest version of transformers from PyPI 14 | latest_version=$(yolk -V transformers | cut -d ' ' -f 2) 15 | 16 | # Check if the installed version is not the latest 17 | if [ "$installed_version" != "$latest_version" ]; then 18 | pip install -U transformers 19 | fi 20 | 21 | # Get the installed version of deepspeed 22 | installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2) 23 | 24 | # Get the latest version of deepspeed from PyPI 25 | latest_version=$(yolk -V deepspeed | cut -d ' ' -f 2) 26 | 27 | # Check if the installed version is not the latest 28 | if [ "$installed_version" != "$latest_version" ]; then 29 | pip install deepspeed==0.12.2 30 | fi 31 | 32 | # Install yolk3k if not installed 33 | if ! 
pip show flash-attn > /dev/null 2>&1; then 34 | pip install flash-attn --no-build-isolation 35 | fi 36 | 37 | 38 | ################## MISTRAL ################## 39 | PROMPT_VERSION=mistral_instruct 40 | MODEL_VERSION="Mistral-7B-Instruct-v0.2" 41 | ################## VICUNA ################## 42 | 43 | 44 | ################## project ################## 45 | PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-mlp2x_gelu-pretrain_blip558k_plain" 46 | 47 | ################## data ################## 48 | DATA_NAME="llava_instruct_150k" 49 | 50 | # wandb configure 51 | export WANDB_API_KEY="03fc62d68025c9498cf6493432551badd7d4f953" 52 | wandb login $WANDB_API_KEY 53 | 54 | export WANDB_NAME=$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME 55 | 56 | export WANDB_PROJECT=LLaVA_Mixtral 57 | 58 | export WANDB_MODE=online 59 | 60 | wandb online 61 | 62 | 63 | deepspeed --master_port 26000 \ 64 | llava/train/train_mem.py \ 65 | --deepspeed ./scripts/zero2.json \ 66 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 67 | --version $PROMPT_VERSION \ 68 | --data_path ./playground/data/$DATA_NAME.json \ 69 | --image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data/coco/train2017 \ 70 | --vision_tower openai/clip-vit-large-patch14 \ 71 | --pretrain_mm_mlp_adapter ./checkpoints/$PROJECT_NAME/mm_projector.bin \ 72 | --mm_vision_select_layer -2 \ 73 | --mm_projector_type mlp2x_gelu \ 74 | --mm_use_im_start_end False \ 75 | --mm_use_im_patch_token False \ 76 | --bf16 True \ 77 | --output_dir ./checkpoints/llava--$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME--finetune \ 78 | --num_train_epochs 1 \ 79 | --per_device_train_batch_size 16 \ 80 | --per_device_eval_batch_size 4 \ 81 | --gradient_accumulation_steps 1 \ 82 | --evaluation_strategy "no" \ 83 | --save_strategy "steps" \ 84 | --save_steps 50000 \ 85 | --save_total_limit 1 \ 86 | --learning_rate 2e-5 \ 87 | --weight_decay 0. 
\ 88 | --warmup_ratio 0.03 \ 89 | --lr_scheduler_type "cosine" \ 90 | --logging_steps 1 \ 91 | --tf32 True \ 92 | --model_max_length 2048 \ 93 | --gradient_checkpointing True \ 94 | --dataloader_num_workers 16 \ 95 | --lazy_preprocess True \ 96 | --report_to wandb 97 | -------------------------------------------------------------------------------- /scripts/archived/train/finetune_ov.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | export NCCL_SOCKET_IFNAME=eth0 5 | export NCCL_DEBUG=INFO 6 | 7 | LLM_VERSION="Qwen/Qwen2-7B-Instruct" 8 | # for 7b model we recommend bs=1, accum=2, 16 nodes, 128 gpus, lr=1e-5, warmup=0.03 9 | # for 72b model we recommend bs=1, accum=1, 32 nodes, 256 gpus, lr=1e-5, warmup=0.03 10 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 11 | VISION_MODEL_VERSION="google/siglip-so400m-patch14-384" 12 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 13 | 14 | ############### Pretrain ################ 15 | 16 | BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain" 17 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 18 | 19 | ############### Finetune ################ 20 | 21 | # Stage 2 22 | PROMPT_VERSION="qwen_1_5" 23 | RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_stage_am9" 24 | PREV_STAGE_CHECKPOINT="/mnt/bn/vl-research/checkpoints/onevision/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mid_to_final_next_3m_am9_july14" # replace it with your last checkpoint training from single image collection 25 | echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}" 26 | echo "MID_RUN_NAME: ${RUN_NAME}" 27 | 28 | ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 29 | llava/train/train_mem.py \ 30 | --deepspeed scripts/zero3.json \ 31 | --model_name_or_path $PREV_STAGE_CHECKPOINT \ 32 | --version $PROMPT_VERSION \ 33 | --data_path /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next/scripts/i18n/scale_llms/next_ov_stage_july21.yaml \ 34 | --image_folder /mnt/bn/vl-research/data/llava_data \ 35 | --video_folder /mnt/bn/vl-research/data/llava_video \ 36 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 37 | --mm_vision_tower_lr=2e-6 \ 38 | --vision_tower ${VISION_MODEL_VERSION} \ 39 | --mm_projector_type mlp2x_gelu \ 40 | --mm_vision_select_layer -2 \ 41 | --mm_use_im_start_end False \ 42 | --mm_use_im_patch_token False \ 43 | --group_by_modality_length True \ 44 | --image_aspect_ratio anyres_max_9 \ 45 | --image_grid_pinpoints "(1x1),...,(6x6)" \ 46 | --mm_patch_merge_type spatial_unpad \ 47 | --bf16 True \ 48 | --run_name $RUN_NAME \ 49 | --output_dir /mnt/bn/vl-research/checkpoints/onevision/$RUN_NAME \ 50 | --num_train_epochs 1 \ 51 | --per_device_train_batch_size 1 \ 52 | --per_device_eval_batch_size 4 \ 53 | --gradient_accumulation_steps 2 \ 54 | --evaluation_strategy "no" \ 55 | --save_strategy "steps" \ 56 | --save_steps 1000 \ 57 | --save_total_limit 1 \ 58 | --learning_rate 1e-5 \ 59 | --weight_decay 0. 
\ 60 | --warmup_ratio 0.03 \ 61 | --lr_scheduler_type "cosine" \ 62 | --logging_steps 1 \ 63 | --tf32 True \ 64 | --model_max_length 32768 \ 65 | --gradient_checkpointing True \ 66 | --dataloader_num_workers 4 \ 67 | --lazy_preprocess True \ 68 | --report_to wandb \ 69 | --torch_compile True \ 70 | --torch_compile_backend "inductor" \ 71 | --dataloader_drop_last True \ 72 | --frames_upbound 32 73 | exit 0; 74 | 75 | # You can delete the sdpa attn_implementation if you want to use flash attn 76 | -------------------------------------------------------------------------------- /scripts/archived/video/eval/video_detail_description_eval_shard.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ROOT_DIR="/mnt/bn/vl-research/workspace/yhzhang/llava-next-video" 3 | 4 | if [ ! -e $ROOT_DIR ]; then 5 | echo "The root dir does not exist. Exiting the script." 6 | exit 1 7 | fi 8 | 9 | cd $ROOT_DIR 10 | 11 | export PYTHONWARNINGS=ignore 12 | export TOKENIZERS_PARALLELISM=false 13 | 14 | OPENAIKEY="INPUT YOUR OPENAI API" 15 | 16 | CKPT=$1 17 | CONV_MODE=$2 18 | FRAMES=$3 19 | POOL_STRIDE=$4 20 | OVERWRITE=$5 21 | CHUNKS=${6:-1} 22 | 23 | echo "Using $CHUNKS GPUs" 24 | 25 | if [ "$OVERWRITE" = False ]; then 26 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}_overwrite_${OVERWRITE} 27 | 28 | else 29 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE} 30 | fi 31 | 32 | # Assuming GPULIST is a bash array containing your GPUs 33 | GPULIST=(0 1 2 3 4 5 6 7) 34 | 35 | # Get the number of GPUs 36 | NUM_GPUS=${#GPULIST[@]} 37 | 38 | # Calculate GPUs per chunk 39 | GPUS_PER_CHUNK=$((NUM_GPUS / CHUNKS)) 40 | 41 | 42 | for IDX in $(seq 1 $CHUNKS); do 43 | START=$(((IDX-1) * GPUS_PER_CHUNK)) 44 | LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index 45 | 46 | CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH}) 47 | 48 | # Convert the chunk GPUs array to a comma-separated string 49 | CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}") 50 | 51 | # ALL_GPUS_FREE=0 52 | # while [ $ALL_GPUS_FREE -eq 0 ]; do 53 | # ALL_GPUS_FREE=1 # Assume all GPUs are free initially 54 | 55 | # for GPU_ID in $CHUNK_GPUS; do 56 | # MEM_USAGE=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $GPU_ID | tr -d '[:space:]') 57 | 58 | # # Assuming a GPU is considered free if its memory usage is less than 100 MiB 59 | # if [ "$MEM_USAGE" -ge 100 ]; then 60 | # ALL_GPUS_FREE=0 61 | # echo "GPU $GPU_ID is in use. Memory used: ${MEM_USAGE}MiB." 62 | # break # Exit the loop early as we found a GPU that is not free 63 | # fi 64 | # done 65 | 66 | # if [ $ALL_GPUS_FREE -eq 0 ]; then 67 | # echo "Not all GPUs in chunk are free. Checking again in 100 seconds..." 
68 | # sleep 100 69 | # fi 70 | # done 71 | 72 | echo "CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR" 73 | CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 llava/eval/model_video_detail_description.py \ 74 | --model-path $CKPT \ 75 | --video_dir ./data/llava_video/video-chatgpt/evaluation/Test_Videos/ \ 76 | --output_dir ./work_dirs/eval_video_detail_description/$SAVE_DIR \ 77 | --output_name pred \ 78 | --num-chunks $CHUNKS \ 79 | --chunk-idx $(($IDX - 1)) \ 80 | --overwrite ${OVERWRITE} \ 81 | --mm_spatial_pool_stride ${POOL_STRIDE:-4} \ 82 | --for_get_frames_num $FRAMES \ 83 | --conv-mode $CONV_MODE & 84 | done 85 | 86 | wait 87 | 88 | python3 llava/eval/evaluate_benchmark_video_detail_description.py \ 89 | --pred_path ./work_dirs/eval_video_detail_description/$SAVE_DIR \ 90 | --output_dir ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results \ 91 | --output_json ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results.json \ 92 | --num_chunks $CHUNKS \ 93 | --num_tasks 16 \ 94 | --api_key $OPENAIKEY \ 95 | 96 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/convert_sqa_to_llava.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot(problems, split_indices, prompt_format, use_caption=False, is_test=False) 13 | 14 | target_format = [] 15 | for prob_id, (input, output) in split_problems.items(): 16 | if input.startswith("Question: "): 17 | input = input.replace("Question: ", "") 18 | if output.startswith("Answer: "): 19 | output = output.replace("Answer: ", "") 20 | 21 | raw_prob_data = problems[prob_id] 22 | if raw_prob_data["image"] is None: 23 | target_format.append( 24 | { 25 | "id": prob_id, 26 | "conversations": [ 27 | {"from": "human", "value": f"{input}"}, 28 | {"from": "gpt", "value": f"{output}"}, 29 | ], 30 | } 31 | ) 32 | 33 | else: 34 | target_format.append( 35 | { 36 | "id": prob_id, 37 | "image": os.path.join(prob_id, raw_prob_data["image"]), 38 | "conversations": [ 39 | {"from": "human", "value": f"{input}\n"}, 40 | {"from": "gpt", "value": f"{output}"}, 41 | ], 42 | } 43 | ) 44 | 45 | print(f"Number of samples: {len(target_format)}") 46 | 47 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 48 | json.dump(target_format, f, indent=2) 49 | 50 | 51 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 52 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 53 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 54 | 55 | split_problems = build_prompt_chatbot(problems, split_indices, prompt_format, use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith("Question: "): 60 | input = input.replace("Question: ", "") 61 | if output.startswith("Answer: "): 62 | output = output.replace("Answer: ", "") 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data["image"] is None: 66 | data = { 67 | "id": prob_id, 68 
| "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data["image"]), 76 | "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + "\n") 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_1.5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | dataset_name=$1 3 | 4 | # Uncomment and set the following variables correspondingly to run this script: 5 | 6 | cd /mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA 7 | 8 | # Install yolk3k if not installed 9 | if ! pip show yolk3k > /dev/null 2>&1; then 10 | pip install yolk3k 11 | fi 12 | 13 | # Get the installed version of transformers 14 | installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2) 15 | 16 | # Get the latest version of transformers from PyPI 17 | latest_version=$(yolk -V transformers | cut -d ' ' -f 2) 18 | 19 | # Check if the installed version is not the latest 20 | if [ "$installed_version" != "$latest_version" ]; then 21 | pip install -U transformers 22 | fi 23 | 24 | # Get the installed version of deepspeed 25 | installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2) 26 | 27 | # Get the latest version of deepspeed from PyPI 28 | latest_version=$(yolk -V deepspeed | cut -d ' ' -f 2) 29 | 30 | # Check if the installed version is not the latest 31 | if [ "$installed_version" != "$latest_version" ]; then 32 | pip install deepspeed==0.12.2 33 | fi 34 | 35 | # Install yolk3k if not installed 36 | if ! 
pip show flash-attn > /dev/null 2>&1; then 37 | pip install flash-attn --no-build-isolation 38 | fi 39 | 40 | 41 | ################## VICUNA ################## 42 | PROMPT_VERSION=v1 43 | MODEL_VERSION="vicuna-7b-v1-5" 44 | ################## VICUNA ################## 45 | 46 | ################## project ################## 47 | PROJECT_NAME="ds_llava-vicuna-7b-v1-5-mlp2x_gelu-pretrain_blip558k_plain" 48 | 49 | ################## data ################## 50 | DATA_NAME=$dataset_name 51 | 52 | 53 | # wandb configure 54 | export WANDB_API_KEY="03fc62d68025c9498cf6493432551badd7d4f953" 55 | wandb login $WANDB_API_KEY 56 | 57 | export WANDB_NAME=$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME 58 | 59 | export WANDB_PROJECT=LLaVA_Mixtral 60 | 61 | export WANDB_MODE=online 62 | 63 | wandb online 64 | 65 | 66 | deepspeed --master_port 26000 \ 67 | llava/train/train_mem.py \ 68 | --deepspeed ./scripts/zero2.json \ 69 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 70 | --version $PROMPT_VERSION \ 71 | --data_path ./playground/data/$DATA_NAME.json \ 72 | --image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data \ 73 | --vision_tower openai/clip-vit-large-patch14 \ 74 | --pretrain_mm_mlp_adapter ./checkpoints/$PROJECT_NAME/mm_projector.bin \ 75 | --mm_vision_select_layer -2 \ 76 | --mm_projector_type mlp2x_gelu \ 77 | --mm_use_im_start_end False \ 78 | --mm_use_im_patch_token False \ 79 | --bf16 True \ 80 | --output_dir ./checkpoints/llava--$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME--finetune \ 81 | --num_train_epochs 1 \ 82 | --per_device_train_batch_size 16 \ 83 | --per_device_eval_batch_size 4 \ 84 | --gradient_accumulation_steps 1 \ 85 | --evaluation_strategy "no" \ 86 | --save_strategy "steps" \ 87 | --save_steps 50000 \ 88 | --save_total_limit 1 \ 89 | --learning_rate 2e-5 \ 90 | --weight_decay 0. 
\ 91 | --warmup_ratio 0.03 \ 92 | --lr_scheduler_type "cosine" \ 93 | --logging_steps 1 \ 94 | --tf32 True \ 95 | --model_max_length 2048 \ 96 | --gradient_checkpointing True \ 97 | --dataloader_num_workers 16 \ 98 | --lazy_preprocess True \ 99 | --report_to wandb 100 | -------------------------------------------------------------------------------- /scripts/mira_train/sft_clip.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | # export NCCL_SOCKET_IFNAME=eth0 5 | export NCCL_DEBUG=INFO 6 | 7 | LLM_VERSION="meta-llama/Meta-Llama-3-8B" 8 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 9 | VISION_MODEL_VERSION="biomedclip" 10 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 11 | 12 | ############### Pretrain ################ 13 | 14 | PROMPT_VERSION="llama_v3" 15 | 16 | BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" 17 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 18 | 19 | MID_RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-sft_stage_am9" 20 | 21 | CKPT_PATH=$LLM_VERSION # this could also be the previous stage checkpoint 22 | 23 | # ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 24 | # --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 25 | deepspeed llava/train/train_mem.py \ 26 | --deepspeed scripts/zero2_offload.json \ 27 | --model_name_or_path ${CKPT_PATH} \ 28 | --version ${PROMPT_VERSION} \ 29 | --data_path /home/jinhong.wang/workdir/dataset/llava_med_jsons/checked/instruct/llava_med_instruct_60k.json \ 30 | --image_folder /home/jinhong.wang/workdir/dataset/llava_med/images \ 31 | --pretrain_mm_mlp_adapter /home/jinhong.wang/workdir/checkpoints/pt-projector/llavamed-lnext-rag-biomedclip-llama3-8b/mm_projector.bin \ 32 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 33 | --mm_vision_tower_lr=2e-6 \ 34 | --vision_tower ${VISION_MODEL_VERSION} \ 35 | --mm_projector_type mlp2x_gelu \ 36 | --mm_vision_select_layer -2 \ 37 | --mm_use_im_start_end False \ 38 | --mm_use_im_patch_token False \ 39 | --group_by_modality_length True \ 40 | --rag_enabled True \ 41 | --rag_idx /home/jinhong.wang/workdir/database_rag/faiss_index_1222.idx \ 42 | --rag_mdpath /home/jinhong.wang/workdir/database_rag/metadata_1222.csv \ 43 | --rag_tokenizer all-MiniLM-L6-v2 \ 44 | --rag_topk 5 \ 45 | --query_rewrite_enabled False \ 46 | --query_rewrite_host http://localhost:11434/api/chat \ 47 | --query_rewrite_model mistral-small:22b \ 48 | --image_aspect_ratio anyres \ 49 | --image_grid_pinpoints "[(384, 768), (768, 384), (768, 768), (1152, 384), (384, 1152)]" \ 50 | --mm_patch_merge_type spatial_unpad \ 51 | --bf16 True \ 52 | --run_name $MID_RUN_NAME \ 53 | --output_dir "/home/jinhong.wang/workdir/checkpoints/ft-lmed-rag/${MID_RUN_NAME}" \ 54 | --num_train_epochs 1 \ 55 | --per_device_train_batch_size 1 \ 56 | --per_device_eval_batch_size 1 \ 57 | --gradient_accumulation_steps 32 \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 1000 \ 61 | --save_total_limit 1 \ 62 | --learning_rate 1e-5 \ 63 | --weight_decay 0. 
\ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --tf32 True \ 68 | --model_max_length 32768 \ 69 | --gradient_checkpointing True \ 70 | --dataloader_num_workers 16 \ 71 | --lazy_preprocess True \ 72 | --report_to wandb \ 73 | --torch_compile True \ 74 | --torch_compile_backend "inductor" \ 75 | --dataloader_drop_last True \ 76 | --attn_implementation sdpa 77 | 78 | # You can delete the sdpa attn_implementation if you want to use flash attn 79 | -------------------------------------------------------------------------------- /scripts/mira_train/sft_siglip.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | # export NCCL_SOCKET_IFNAME=eth0 5 | export NCCL_DEBUG=INFO 6 | 7 | LLM_VERSION="meta-llama/Meta-Llama-3-8B" 8 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 9 | VISION_MODEL_VERSION="google/siglip-so400m-patch14-384" 10 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 11 | 12 | ############### Pretrain ################ 13 | 14 | PROMPT_VERSION="llama_v3" 15 | 16 | BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" 17 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 18 | 19 | MID_RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-sft_stage_am9" 20 | 21 | CKPT_PATH=$LLM_VERSION # this could also be the previous stage checkpoint 22 | 23 | # ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 24 | # --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 25 | deepspeed llava/train/train_mem.py \ 26 | --deepspeed scripts/zero2_offload.json \ 27 | --model_name_or_path ${CKPT_PATH} \ 28 | --version ${PROMPT_VERSION} \ 29 | --data_path /home/jinhong.wang/workdir/dataset/llava_med_jsons/checked/instruct/llava_med_instruct_60k.json \ 30 | --image_folder /home/jinhong.wang/workdir/dataset/llava_med/images \ 31 | --pretrain_mm_mlp_adapter /home/jinhong.wang/workdir/checkpoints/pt-projector/llavamed-lnext-rag-siglip-llama3-8b/mm_projector.bin \ 32 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 33 | --mm_vision_tower_lr=2e-6 \ 34 | --vision_tower ${VISION_MODEL_VERSION} \ 35 | --mm_projector_type mlp2x_gelu \ 36 | --mm_vision_select_layer -2 \ 37 | --mm_use_im_start_end False \ 38 | --mm_use_im_patch_token False \ 39 | --group_by_modality_length True \ 40 | --rag_enabled True \ 41 | --rag_idx /home/jinhong.wang/workdir/database_rag/faiss_index_1222.idx \ 42 | --rag_mdpath /home/jinhong.wang/workdir/database_rag/metadata_1222.csv \ 43 | --rag_tokenizer all-MiniLM-L6-v2 \ 44 | --rag_topk 5 \ 45 | --query_rewrite_enabled False \ 46 | --query_rewrite_host http://localhost:11434/api/chat \ 47 | --query_rewrite_model mistral-small:22b \ 48 | --image_aspect_ratio anyres \ 49 | --image_grid_pinpoints "[(384, 768), (768, 384), (768, 768), (1152, 384), (384, 1152)]" \ 50 | --mm_patch_merge_type spatial_unpad \ 51 | --bf16 True \ 52 | --run_name $MID_RUN_NAME \ 53 | --output_dir "/home/jinhong.wang/workdir/checkpoints/ft-lmed-rag/${MID_RUN_NAME}" \ 54 | --num_train_epochs 1 \ 55 | --per_device_train_batch_size 1 \ 56 | --per_device_eval_batch_size 1 \ 57 | --gradient_accumulation_steps 32 \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 1000 \ 61 | --save_total_limit 1 \ 62 
| --learning_rate 1e-5 \ 63 | --weight_decay 0. \ 64 | --warmup_ratio 0.03 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --tf32 True \ 68 | --model_max_length 32768 \ 69 | --gradient_checkpointing True \ 70 | --dataloader_num_workers 16 \ 71 | --lazy_preprocess True \ 72 | --report_to wandb \ 73 | --torch_compile True \ 74 | --torch_compile_backend "inductor" \ 75 | --dataloader_drop_last True \ 76 | --attn_implementation sdpa 77 | 78 | # You can delete the sdpa attn_implementation if you want to use flash attn 79 | -------------------------------------------------------------------------------- /scripts/mira_train/sft_biomedclip.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | # export NCCL_SOCKET_IFNAME=eth0 5 | export NCCL_DEBUG=INFO 6 | 7 | LLM_VERSION="meta-llama/Meta-Llama-3-8B" 8 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 9 | VISION_MODEL_VERSION="biomedclip" 10 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 11 | 12 | ############### Pretrain ################ 13 | 14 | PROMPT_VERSION="llama_v3" 15 | 16 | BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" 17 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 18 | 19 | MID_RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-sft_stage_am9" 20 | 21 | CKPT_PATH=$LLM_VERSION # this could also be the previous stage checkpoint 22 | 23 | # ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 24 | # --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 25 | 26 | # --image_aspect_ratio anyres \ 27 | # --image_grid_pinpoints "[(384, 768), (768, 384), (768, 768), (1152, 384), (384, 1152)]" \ 28 | # --mm_patch_merge_type spatial_unpad \ 29 | 30 | deepspeed llava/train/train_mem.py \ 31 | --deepspeed scripts/zero2_offload.json \ 32 | --model_name_or_path ${CKPT_PATH} \ 33 | --version ${PROMPT_VERSION} \ 34 | --data_path /home/jinhong.wang/workdir/dataset/llava_med_jsons/checked/instruct/llava_med_instruct_60k.json \ 35 | --image_folder /home/jinhong.wang/workdir/dataset/llava_med/images \ 36 | --pretrain_mm_mlp_adapter /home/jinhong.wang/workdir/checkpoints/pt-projector/llavamed-lnext-rag-biomedclip-llama3-8b/mm_projector.bin \ 37 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 38 | --mm_vision_tower_lr=2e-6 \ 39 | --vision_tower ${VISION_MODEL_VERSION} \ 40 | --mm_projector_type mlp2x_gelu \ 41 | --mm_vision_select_layer -2 \ 42 | --mm_use_im_start_end False \ 43 | --mm_use_im_patch_token False \ 44 | --group_by_modality_length True \ 45 | --rag_enabled True \ 46 | --rag_idx /home/jinhong.wang/workdir/database_rag/faiss_index_1222.idx \ 47 | --rag_mdpath /home/jinhong.wang/workdir/database_rag/metadata_1222.csv \ 48 | --rag_tokenizer all-MiniLM-L6-v2 \ 49 | --rag_topk 5 \ 50 | --query_rewrite_enabled False \ 51 | --query_rewrite_host http://localhost:11434/api/chat \ 52 | --query_rewrite_model mistral-small:22b \ 53 | --bf16 True \ 54 | --run_name $MID_RUN_NAME \ 55 | --output_dir "/home/jinhong.wang/workdir/checkpoints/ft-lmed-rag/${MID_RUN_NAME}" \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size 1 \ 58 | --per_device_eval_batch_size 1 \ 59 | --gradient_accumulation_steps 32 \ 60 | --evaluation_strategy "no" \ 61 | --save_strategy "steps" \ 62 | 
--save_steps 1000 \ 63 | --save_total_limit 1 \ 64 | --learning_rate 1e-5 \ 65 | --weight_decay 0. \ 66 | --warmup_ratio 0.03 \ 67 | --lr_scheduler_type "cosine" \ 68 | --logging_steps 1 \ 69 | --tf32 True \ 70 | --model_max_length 32768 \ 71 | --gradient_checkpointing True \ 72 | --dataloader_num_workers 16 \ 73 | --lazy_preprocess True \ 74 | --report_to wandb \ 75 | --torch_compile True \ 76 | --torch_compile_backend "inductor" \ 77 | --dataloader_drop_last True \ 78 | --attn_implementation sdpa 79 | 80 | # You can delete the sdpa attn_implementation if you want to use flash attn 81 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_mixtral_1.6_336px_anyres_freeze_vision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | dataset_name=$1 3 | 4 | cd /mnt/bn/vl-research/workspace/yhzhang/LLaVA 5 | 6 | # Install yolk3k if not installed 7 | if ! pip show yolk3k > /dev/null 2>&1; then 8 | pip install yolk3k 9 | fi 10 | 11 | pip install pydantic 12 | 13 | # Get the installed version of transformers 14 | installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2) 15 | 16 | # Get the latest version of transformers from PyPI 17 | latest_version=$(yolk -V transformers | cut -d ' ' -f 2) 18 | 19 | # Check if the installed version is not the latest 20 | if [ "$installed_version" != "4.36.2" ]; then 21 | pip install transformers==4.36.2 22 | fi 23 | 24 | # Get the installed version of deepspeed 25 | installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2) 26 | 27 | 28 | # Check if the installed version is not the latest 29 | if [ "$installed_version" != "0.12.2" ]; then 30 | pip install deepspeed==0.12.2 31 | fi 32 | 33 | # Install flash-atten if not installed 34 | if ! 
pip show flash-attn > /dev/null 2>&1; then 35 | pip install flash-attn --no-build-isolation 36 | fi 37 | 38 | ################## MISTRAL ################## 39 | PROMPT_VERSION=mistral_instruct 40 | MODEL_VERSION="Mistral-7B-Instruct-v0.2" 41 | ################## MISTRAL ################## 42 | 43 | 44 | ################## project ################## 45 | PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain" 46 | 47 | ################## data ################## 48 | DATA_NAME=$dataset_name 49 | 50 | 51 | # wandb configure 52 | export WANDB_API_KEY=e464cc107357c7b38e87f239bc3eb2ce5fb73c7c 53 | export WANDB_PROJECT=llava 54 | 55 | export WANDB_NAME=$PROJECT_NAME--$DATA_NAME--336px--unfreeze--anyres--sft 56 | 57 | export WANDB_MODE=online 58 | 59 | wandb online 60 | 61 | deepspeed --master_port 26000 \ 62 | llava/train/train_mem.py \ 63 | --deepspeed ./scripts/zero3.json \ 64 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 65 | --version $PROMPT_VERSION \ 66 | --data_path ./playground/data/$DATA_NAME.json \ 67 | --image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data \ 68 | --vision_tower openai/clip-vit-large-patch14-336 \ 69 | --pretrain_mm_mlp_adapter /mnt/bn/vl-research/workspace/project/2023/LLaVA/checkpoints/ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain/mm_projector.bin \ 70 | --mm_vision_select_layer -2 \ 71 | --mm_projector_type mlp2x_gelu \ 72 | --mm_use_im_start_end False \ 73 | --mm_use_im_patch_token False \ 74 | --group_by_modality_length True \ 75 | --image_aspect_ratio anyres \ 76 | --image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \ 77 | --mm_patch_merge_type spatial_unpad \ 78 | --bf16 True \ 79 | --output_dir ./checkpoints/$PROJECT_NAME--$DATA_NAME--336px--anyres--unfreeze--sft \ 80 | --num_train_epochs 1 \ 81 | --per_device_train_batch_size 16 \ 82 | --per_device_eval_batch_size 4 \ 83 | --gradient_accumulation_steps 1 \ 84 | --evaluation_strategy "no" \ 85 | --save_strategy "steps" \ 86 | --save_steps 50000 \ 87 | --save_total_limit 1 \ 88 | --learning_rate 2e-5 \ 89 | --weight_decay 0. \ 90 | --warmup_ratio 0.03 \ 91 | --lr_scheduler_type "cosine" \ 92 | --logging_steps 1 \ 93 | --tf32 True \ 94 | --model_max_length 2048 \ 95 | --gradient_checkpointing True \ 96 | --dataloader_num_workers 16 \ 97 | 98 | -------------------------------------------------------------------------------- /scripts/archived/video/eval/video_description_from_t2v.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ROOT_DIR="/mnt/bn/vl-research/workspace/yhzhang/llava-next-video" 3 | 4 | if [ ! -e $ROOT_DIR ]; then 5 | echo "The root dir does not exist. Exiting the script." 
6 | exit 1 7 | fi 8 | 9 | cd $ROOT_DIR 10 | 11 | export PYTHONWARNINGS=ignore 12 | export TOKENIZERS_PARALLELISM=false 13 | 14 | CKPT=$1 15 | CONV_MODE=$2 16 | FRAMES=$3 17 | POOL_STRIDE=$4 18 | OVERWRITE=$5 19 | CHUNKS=${6:-1} 20 | DO_CENTER_CROP=${7:-False} 21 | 22 | echo "Using $CHUNKS GPUs" 23 | 24 | LOAD_8BIT=False 25 | 26 | 27 | if [ "$OVERWRITE" = False ]; then 28 | if [ "$MODEL_MAX_LENGTH" = 0 ]; then 29 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_overwrite_${OVERWRITE} 30 | else 31 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_overwrite_${OVERWRITE} 32 | fi 33 | else 34 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE} 35 | fi 36 | 37 | SAVE_DIR=${SAVE_DIR}_do_center_crop_${DO_CENTER_CROP} 38 | # Assuming GPULIST is a bash array containing your GPUs 39 | GPULIST=(0 1 2 3 4 5 6 7) 40 | # GPULIST=(0) 41 | 42 | # Get the number of GPUs 43 | NUM_GPUS=${#GPULIST[@]} 44 | 45 | # Calculate GPUs per chunk 46 | GPUS_PER_CHUNK=$((NUM_GPUS / CHUNKS)) 47 | 48 | 49 | for IDX in $(seq 1 $CHUNKS); do 50 | START=$(((IDX-1) * GPUS_PER_CHUNK)) 51 | LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index 52 | 53 | CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH}) 54 | 55 | # Convert the chunk GPUs array to a comma-separated string 56 | CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}") 57 | 58 | # ALL_GPUS_FREE=0 59 | # while [ $ALL_GPUS_FREE -eq 0 ]; do 60 | # ALL_GPUS_FREE=1 # Assume all GPUs are free initially 61 | 62 | # for GPU_ID in $CHUNK_GPUS; do 63 | # MEM_USAGE=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $GPU_ID | tr -d '[:space:]') 64 | 65 | # # Assuming a GPU is considered free if its memory usage is less than 100 MiB 66 | # if [ "$MEM_USAGE" -ge 100 ]; then 67 | # ALL_GPUS_FREE=0 68 | # echo "GPU $GPU_ID is in use. Memory used: ${MEM_USAGE}MiB." 69 | # break # Exit the loop early as we found a GPU that is not free 70 | # fi 71 | # done 72 | 73 | # if [ $ALL_GPUS_FREE -eq 0 ]; then 74 | # echo "Not all GPUs in chunk are free. Checking again in 100 seconds..." 
75 | # sleep 100 76 | # fi 77 | # done 78 | 79 | echo "CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR" 80 | CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 llava/eval/model_video_description_from_t2v.py \ 81 | --model-path $CKPT \ 82 | --gt_file /mnt/bn/vl-research-1t/tuyen/webvid_hdvg_movie_pond5_for_captioning_evaluation/webvid_hdvg_movie_pond5_for_captioning_evaluation.processed.csv \ 83 | --output_dir ./work_dirs/eval_video_description_from_t2v/$SAVE_DIR \ 84 | --output_name pred \ 85 | --num-chunks $CHUNKS \ 86 | --chunk-idx $(($IDX - 1)) \ 87 | --overwrite ${OVERWRITE} \ 88 | --mm_spatial_pool_stride ${POOL_STRIDE:-4} \ 89 | --for_get_frames_num $FRAMES \ 90 | --load_8bit $LOAD_8BIT \ 91 | --do_center_crop $DO_CENTER_CROP \ 92 | --conv-mode $CONV_MODE & 93 | done 94 | 95 | wait 96 | 97 | cat ${ROOT_DIR}/work_dirs/eval_video_description_from_t2v/$SAVE_DIR/${CHUNKS}* > ${ROOT_DIR}/work_dirs/eval_video_description_from_t2v/$SAVE_DIR/pred.json 98 | 99 | -------------------------------------------------------------------------------- /scripts/archived/train/finetune_siglip.sh: -------------------------------------------------------------------------------- 1 | export OMP_NUM_THREADS=8 2 | export NCCL_IB_DISABLE=0 3 | export NCCL_IB_GID_INDEX=3 4 | # export NCCL_SOCKET_IFNAME=eth0 5 | export NCCL_DEBUG=INFO 6 | 7 | LLM_VERSION="meta-llama/Meta-Llama-3-8B" 8 | # for 7b model we recommend bs=1, accum=2, 16 nodes, 128 gpus, lr=1e-5, warmup=0.03 9 | # for 72b model we recommend bs=1, accum=1, 32 nodes, 256 gpus, lr=1e-5, warmup=0.03 10 | LLM_VERSION_CLEAN="${LLM_VERSION//\//_}" 11 | VISION_MODEL_VERSION="google/siglip-so400m-patch14-384" 12 | VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}" 13 | 14 | ############### Pretrain ################ 15 | 16 | BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain" 17 | echo "BASE_RUN_NAME: ${BASE_RUN_NAME}" 18 | 19 | ############### Finetune ################ 20 | 21 | # Stage 2 22 | PROMPT_VERSION="llama_v3" 23 | RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-si_stage_am9" 24 | PREV_STAGE_CHECKPOINT="/home/jinhong.wang/workdir/checkpoints/pt-projector/llavamed-lnext-rag-llama3-8b" # replace it with your last checkpoint training from mid stage 25 | echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}" 26 | echo "MID_RUN_NAME: ${RUN_NAME}" 27 | 28 | # ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \ 29 | deepspeed llava/train/train_mem.py \ 30 | --deepspeed scripts/zero3.json \ 31 | --model_name_or_path $PREV_STAGE_CHECKPOINT \ 32 | --version $PROMPT_VERSION \ 33 | --data_path /home/jinhong.wang/workdir/dataset/llava_med_jsons/checked/instruct/llava_med_instruct_60k.json \ 34 | --image_folder /home/jinhong.wang/workdir/dataset/llava_med/images \ 35 | --video_folder /home/jinhong.wang/workdir/dataset/llava_med/images \ 36 | --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \ 37 | --mm_vision_tower_lr=2e-6 \ 38 | --vision_tower ${VISION_MODEL_VERSION} \ 39 | --mm_projector_type mlp2x_gelu \ 40 | --mm_vision_select_layer -2 \ 41 | --mm_use_im_start_end False \ 42 | --mm_use_im_patch_token False \ 43 | --group_by_modality_length True \ 44 | --rag_enabled True \ 45 | --rag_idx /home/jinhong.wang/workdir/database_rag/faiss_index_1222.idx \ 46 | --rag_mdpath /home/jinhong.wang/workdir/database_rag/metadata_1222.csv \ 47 | --rag_tokenizer 
all-MiniLM-L6-v2 \ 48 | --rag_topk 5 \ 49 | --query_rewrite_enabled False \ 50 | --query_rewrite_host http://localhost:11434/api/chat \ 51 | --query_rewrite_model mistral-small:22b \ 52 | --image_aspect_ratio anyres_max_9 \ 53 | --image_grid_pinpoints "(1x1),...,(6x6)" \ 54 | --mm_patch_merge_type spatial_unpad \ 55 | --bf16 True \ 56 | --run_name $RUN_NAME \ 57 | --output_dir /mnt/bn/vl-research/checkpoints/onevision/$RUN_NAME \ 58 | --num_train_epochs 1 \ 59 | --per_device_train_batch_size 8 \ 60 | --per_device_eval_batch_size 1 \ 61 | --gradient_accumulation_steps 4 \ 62 | --evaluation_strategy "no" \ 63 | --save_strategy "steps" \ 64 | --save_steps 1000 \ 65 | --save_total_limit 1 \ 66 | --learning_rate 1e-5 \ 67 | --weight_decay 0. \ 68 | --warmup_ratio 0.03 \ 69 | --lr_scheduler_type "cosine" \ 70 | --logging_steps 1 \ 71 | --tf32 True \ 72 | --model_max_length 32768 \ 73 | --gradient_checkpointing True \ 74 | --dataloader_num_workers 4 \ 75 | --lazy_preprocess True \ 76 | --report_to wandb \ 77 | --torch_compile True \ 78 | --torch_compile_backend "inductor" \ 79 | --dataloader_drop_last True \ 80 | --frames_upbound 32 \ 81 | --attn_implementation sdpa 82 | 83 | exit 0; 84 | -------------------------------------------------------------------------------- /llava/train/llava_trainer_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | 4 | from llava.train.llava_trainer import LLaVATrainer 5 | 6 | 7 | class LLaVAEvalTrainer(LLaVATrainer): 8 | def evaluate(self, evaluate_args): 9 | cmd = f"accelerate launch --num_processes {evaluate_args.eval_num_processes} -m lmms_eval \ 10 | --model {evaluate_args.model} \ 11 | --model_args {evaluate_args.model_args} \ 12 | --tasks {evaluate_args.task_names} \ 13 | --batch_size {evaluate_args.batch_size} \ 14 | --log_samples_suffix {evaluate_args.log_samples_suffix} \ 15 | --output_path {evaluate_args.output_path}" 16 | if evaluate_args.limit: 17 | cmd += f" --limit {evaluate_args.limit}" 18 | if evaluate_args.num_fewshot: 19 | cmd += f" --num_fewshot {evaluate_args.num_fewshot}" 20 | if evaluate_args.gen_kwargs != "": 21 | cmd += f" --gen_kwargs {evaluate_args.gen_kwargs}" 22 | if evaluate_args.log_samples: 23 | cmd += f" --log_samples" 24 | else: 25 | assert False, "Please log samples so that the result can be parsed" 26 | results = subprocess.run([cmd], shell=True, capture_output=True, text=True) 27 | try: 28 | result_file_index_start = results.stdout.index("Saved samples to ") 29 | result_file_index_end = results.stdout.index(f".json") 30 | result_file_index_start += len("Saved samples to ") 31 | file = results.stdout[result_file_index_start:result_file_index_end] 32 | except: 33 | result_file_index_start = results.stderr.index("Saved samples to ") 34 | result_file_index_end = results.stderr.index(f".json") 35 | result_file_index_start += len("Saved samples to ") 36 | file = results.stderr[result_file_index_start:result_file_index_end] 37 | file = file.split("/")[:-1] 38 | file = "/".join(file) + "/results.json" 39 | with open(file, "r") as f: 40 | lmms_eval_results = json.load(f) 41 | result_dict = {} 42 | tasks_list = evaluate_args.task_names.split(",") 43 | for task in tasks_list: 44 | task_results = lmms_eval_results["results"][task] 45 | for k, v in task_results.items(): 46 | if k != "alias" and "stderr" not in k: 47 | metric = k.split(",")[0] 48 | result_dict[f"{task}_{metric}"] = v 49 | return result_dict 50 | 51 | """def evaluate(self, evaluate_args): 52 | 
initialize_tasks() 53 | tasks_list = evaluate_args.task_names.split(",") 54 | result_dict = {} 55 | results = evaluator.simple_evaluate( 56 | model=evaluate_args.model, 57 | model_args=evaluate_args.model_args, 58 | tasks=tasks_list, 59 | num_fewshot=evaluate_args.num_fewshot, 60 | batch_size=evaluate_args.batch_size, 61 | device=evaluate_args.device, 62 | limit=evaluate_args.limit, 63 | check_integrity=evaluate_args.check_integrity, 64 | show_task_to_terminal=evaluate_args.show_task_to_terminal, 65 | log_samples=evaluate_args.log_samples, 66 | gen_kwargs=evaluate_args.gen_kwargs, 67 | cli_args=evaluate_args, 68 | ) 69 | for task in tasks_list: 70 | task_results = results["results"][task] 71 | for k,v in task_results.items(): 72 | if k != "alias" and "stderr" not in k: 73 | metric = k.split(",")[0] 74 | result_dict[f"{task}_{metric}"] = v 75 | 76 | return result_dict""" 77 | -------------------------------------------------------------------------------- /scripts/archived/archived_prev/finetune_mixtral_1.6_336px_anyres.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | dataset_name=$1 3 | 4 | cd /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next 5 | 6 | # Install yolk3k if not installed 7 | if ! pip show yolk3k > /dev/null 2>&1; then 8 | pip install yolk3k 9 | fi 10 | 11 | pip install pydantic 12 | 13 | # Get the installed version of transformers 14 | installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2) 15 | 16 | # Get the latest version of transformers from PyPI 17 | latest_version=$(yolk -V transformers | cut -d ' ' -f 2) 18 | 19 | # Check if the installed version is not the latest 20 | if [ "$installed_version" != "4.36.2" ]; then 21 | pip install transformers==4.36.2 22 | fi 23 | 24 | # Get the installed version of deepspeed 25 | installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2) 26 | 27 | 28 | # Check if the installed version is not the latest 29 | if [ "$installed_version" != "0.12.2" ]; then 30 | pip install deepspeed==0.12.2 31 | fi 32 | 33 | # Install flash-atten if not installed 34 | if ! 
pip show flash-attn > /dev/null 2>&1; then 35 | pip install flash-attn --no-build-isolation 36 | fi 37 | 38 | ################## MISTRAL ################## 39 | PROMPT_VERSION=mistral_instruct 40 | MODEL_VERSION="Mistral-7B-Instruct-v0.2" 41 | ################## MISTRAL ################## 42 | 43 | 44 | ################## project ################## 45 | PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain" 46 | 47 | ################## data ################## 48 | DATA_NAME=$dataset_name 49 | 50 | 51 | # wandb configure 52 | export WANDB_API_KEY=e464cc107357c7b38e87f239bc3eb2ce5fb73c7c 53 | export WANDB_PROJECT=llava 54 | 55 | export WANDB_NAME=$PROJECT_NAME--$DATA_NAME--336px--anyres--sft 56 | 57 | export WANDB_MODE=online 58 | 59 | wandb online 60 | 61 | deepspeed --master_port 26000 \ 62 | llava/train/train_mem.py \ 63 | --deepspeed ./scripts/zero3.json \ 64 | --model_name_or_path /mnt/bn/vl-research/workspace/project/2023/LLaVA/checkpoints/$MODEL_VERSION \ 65 | --version $PROMPT_VERSION \ 66 | --data_path ./playground/data/$DATA_NAME.json \ 67 | --image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data \ 68 | --vision_tower openai/clip-vit-large-patch14-336 \ 69 | --pretrain_mm_mlp_adapter /mnt/bn/vl-research/workspace/project/2023/LLaVA/checkpoints/ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain/mm_projector.bin \ 70 | --mm_projector_type mlp2x_gelu \ 71 | --mm_vision_select_layer -2 \ 72 | --mm_use_im_start_end False \ 73 | --mm_use_im_patch_token False \ 74 | --group_by_modality_length True \ 75 | --unfreeze_mm_vision_tower True \ 76 | --mm_vision_tower_lr 2e-6 \ 77 | --image_aspect_ratio anyres \ 78 | --image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \ 79 | --mm_patch_merge_type spatial_unpad \ 80 | --bf16 True \ 81 | --output_dir ./checkpoints/$PROJECT_NAME--$DATA_NAME--336px--anyres--sft \ 82 | --num_train_epochs 9 \ 83 | --per_device_train_batch_size 8 \ 84 | --per_device_eval_batch_size 4 \ 85 | --gradient_accumulation_steps 1 \ 86 | --evaluation_strategy "no" \ 87 | --save_strategy "epoch" \ 88 | --save_steps 1500 \ 89 | --learning_rate 5e-6 \ 90 | --weight_decay 0. \ 91 | --warmup_ratio 0.03 \ 92 | --lr_scheduler_type "cosine" \ 93 | --logging_steps 1 \ 94 | --tf32 True \ 95 | --model_max_length 4096 \ 96 | --gradient_checkpointing True \ 97 | --dataloader_num_workers 8 \ 98 | --lazy_preprocess True \ 99 | --report_to wandb 100 | 101 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transform.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms.functional as F 6 | 7 | from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, CenterCrop 8 | 9 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 10 | 11 | 12 | class ResizeMaxSize(nn.Module): 13 | 14 | def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn="max", fill=0): 15 | super().__init__() 16 | if not isinstance(max_size, int): 17 | raise TypeError(f"Size should be int. 
Got {type(max_size)}") 18 | self.max_size = max_size 19 | self.interpolation = interpolation 20 | self.fn = min if fn == "min" else min 21 | self.fill = fill 22 | 23 | def forward(self, img): 24 | if isinstance(img, torch.Tensor): 25 | height, width = img.shape[:2] 26 | else: 27 | width, height = img.size 28 | scale = self.max_size / float(max(height, width)) 29 | if scale != 1.0: 30 | new_size = tuple(round(dim * scale) for dim in (height, width)) 31 | img = F.resize(img, new_size, self.interpolation) 32 | pad_h = self.max_size - new_size[0] 33 | pad_w = self.max_size - new_size[1] 34 | img = F.pad(img, padding=[pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2], fill=self.fill) 35 | return img 36 | 37 | 38 | def _convert_to_rgb(image): 39 | return image.convert("RGB") 40 | 41 | 42 | # class CatGen(nn.Module): 43 | # def __init__(self, num=4): 44 | # self.num = num 45 | # def mixgen_batch(image, text): 46 | # batch_size = image.shape[0] 47 | # index = np.random.permutation(batch_size) 48 | 49 | # cat_images = [] 50 | # for i in range(batch_size): 51 | # # image mixup 52 | # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:] 53 | # # text concat 54 | # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0] 55 | # text = torch.stack(text) 56 | # return image, text 57 | 58 | 59 | def image_transform( 60 | image_size: int, 61 | is_train: bool, 62 | mean: Optional[Tuple[float, ...]] = None, 63 | std: Optional[Tuple[float, ...]] = None, 64 | resize_longest_max: bool = False, 65 | fill_color: int = 0, 66 | ): 67 | mean = mean or OPENAI_DATASET_MEAN 68 | if not isinstance(mean, (list, tuple)): 69 | mean = (mean,) * 3 70 | 71 | std = std or OPENAI_DATASET_STD 72 | if not isinstance(std, (list, tuple)): 73 | std = (std,) * 3 74 | 75 | if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: 76 | # for square size, pass size as int so that Resize() uses aspect preserving shortest edge 77 | image_size = image_size[0] 78 | 79 | normalize = Normalize(mean=mean, std=std) 80 | if is_train: 81 | return Compose( 82 | [ 83 | RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), 84 | _convert_to_rgb, 85 | ToTensor(), 86 | normalize, 87 | ] 88 | ) 89 | else: 90 | if resize_longest_max: 91 | transforms = [ResizeMaxSize(image_size, fill=fill_color)] 92 | else: 93 | transforms = [ 94 | Resize(image_size, interpolation=InterpolationMode.BICUBIC), 95 | CenterCrop(image_size), 96 | ] 97 | transforms.extend( 98 | [ 99 | _convert_to_rgb, 100 | ToTensor(), 101 | normalize, 102 | ] 103 | ) 104 | return Compose(transforms) 105 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, MptConfig, MptForCausalLM, MptModel, GenerationConfig 21 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 22 | 23 | 24 | class LlavaMptConfig(MptConfig): 25 | model_type = "llava_mpt" 26 | 27 | 28 | class LlavaMptModel(LlavaMetaModel, MptModel): 29 | config_class = LlavaMptConfig 30 | 31 | def __init__(self, config: MptConfig): 32 | config.hidden_size = config.d_model 33 | super(LlavaMptModel, self).__init__(config) 34 | 35 | def embed_tokens(self, x): 36 | return self.wte(x) 37 | 38 | 39 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 40 | config_class = LlavaMptConfig 41 | supports_gradient_checkpointing = True 42 | 43 | def __init__(self, config): 44 | super(MptForCausalLM, self).__init__(config) 45 | 46 | config.model_type = "llava_mpt" 47 | config.rope_scaling = None 48 | self.generation_config = GenerationConfig( 49 | temperature=0.0, 50 | max_new_tokens=1024, 51 | do_sample=False, 52 | top_p=None, 53 | ) 54 | 55 | self.transformer = LlavaMptModel(config) 56 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 57 | 58 | # Initialize weights and apply final processing 59 | self.post_init() 60 | 61 | def get_model(self): 62 | return self.transformer 63 | 64 | def _set_gradient_checkpointing(self, module, value=False): 65 | if isinstance(module, LlavaMptModel): 66 | module.gradient_checkpointing = value 67 | 68 | def forward( 69 | self, 70 | input_ids: Optional[torch.LongTensor] = None, 71 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 72 | attention_mask: Optional[torch.Tensor] = None, 73 | inputs_embeds: Optional[torch.Tensor] = None, 74 | labels: Optional[torch.Tensor] = None, 75 | use_cache: Optional[bool] = None, 76 | output_attentions: Optional[bool] = None, 77 | output_hidden_states: Optional[bool] = None, 78 | return_dict: Optional[bool] = None, 79 | cache_position=None, 80 | images=None, 81 | ): 82 | 83 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 84 | 85 | return super().forward( 86 | input_ids, 87 | past_key_values=past_key_values, 88 | attention_mask=attention_mask, 89 | inputs_embeds=inputs_embeds, 90 | labels=labels, 91 | use_cache=use_cache, 92 | output_attentions=output_attentions, 93 | output_hidden_states=output_hidden_states, 94 | return_dict=return_dict, 95 | ) 96 | 97 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 98 | images = kwargs.pop("images", None) 99 | _inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs) 100 | _inputs["images"] = images 101 | return _inputs 102 | 103 | 104 | AutoConfig.register("llava_mpt", LlavaMptConfig) 105 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Oryx Video-ChatGPT
6 | MIRA: A Novel Framework for Fusing Modalities in Medical RAG
15 | 16 | [![arXiv](https://img.shields.io/badge/arXiv-2507.07902-b31b1b)](https://arxiv.org/abs/2507.07902) 17 | [![Hugging Face](https://img.shields.io/badge/Dataset-HuggingFace-orange?logo=huggingface)](https://huggingface.co/datasets/Tajamul21/Agent-X) 18 | [![Download](https://img.shields.io/badge/Dataset-Download-blue?logo=cloud)](https://github.com/Tajamul21/Agent-X-Benchmark/releases/download/v0.1.0/agent-X_dataset.zip) 19 | [![Website](https://img.shields.io/badge/View-website-green)](#-) 20 | 21 |
22 | 23 | 24 | 25 | 26 | #### Authors: [Jinhong Wang](https://scholar.google.com/citations?user=Z_YOUR_ID)\*, [Tajamul Ashraf](https://www.tajamulashraf.com)\*, [Zongyan Han](https://scholar.google.com/citations?user=Z_YOUR_ID), [Jorma Laaksonen](https://people.aalto.fi/jorma.laaksonen), [Rao Muhammad Anwer](https://mbzuai.ac.ae/study/faculty/rao-muhammad-anwer/) 27 | 28 | 29 | \* Equal contribution, **Correspondence:** [Tajamul Ashraf](https://www.tajamulashraf.com) 30 |
31 | 33 |
34 | 35 | 36 | 37 | ## Updates 38 | 39 | - **[2025-07-09]**: 🎉 MIRA paper **accepted at [ACM Multimedia 2025](https://acmmm.org/2025)** 40 | - **[2025-06-02]**: MIRA paper published on [arXiv:2507.07902](https://arxiv.org/abs/2507.07902) 41 | - **[2025-05-29]**: Released **evaluation & deployment code** for MIRA 42 | - **[2025-05-22]**: Published the **MIRA dataset on [Hugging Face](https://huggingface.co/datasets)** 43 | 44 | 45 | ## Introduction 46 | 47 | Multimodal Large Language Models (MLLMs) have significantly advanced AI-assisted medical diagnosis, but they often generate factually inconsistent responses that deviate from established medical knowledge. Retrieval-Augmented Generation (RAG) enhances factual accuracy by integrating external sources, but it presents two key challenges. First, insufficient retrieval can miss critical information, whereas excessive retrieval can introduce irrelevant or misleading content, disrupting model output. Second, even when the model initially provides correct answers, over-reliance on retrieved data can lead to factual errors. 48 | 49 |
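To make the retrieval-depth tradeoff concrete, the snippet below shows one generic way to cap and filter retrieved passages before they reach the model. It is only an illustration of the problem, not the calibrated mechanism MIRA introduces below; the `select_contexts` helper, its cut-off values, and the sample passages are hypothetical.

```python
from typing import List, Tuple

def select_contexts(ranked: List[Tuple[float, str]],
                    max_k: int = 5,
                    min_score: float = 0.35) -> List[str]:
    """Keep at most max_k retrieved passages and drop low-similarity ones.

    `ranked` holds (similarity, passage) pairs sorted best-first; the cut-off
    values here are illustrative, not MIRA's calibrated thresholds.
    """
    return [text for score, text in ranked[:max_k] if score >= min_score]

# An over-long or loosely related retrieval list gets trimmed, limiting the
# irrelevant context fed to the model; an empty result simply means the model
# answers from its own knowledge.
ranked = [
    (0.81, "Chest X-ray findings typical of lobar pneumonia ..."),
    (0.62, "Consolidation patterns on thoracic CT ..."),
    (0.22, "Unrelated dermatology note ..."),
]
print(select_contexts(ranked))  # keeps only the first two passages
```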
50 | 51 |
52 | 53 | ## What is MIRA? 54 | 55 | We introduce the Multimodal Intelligent Retrieval and Augmentation (MIRA) framework, designed to optimize factual accuracy in MLLMs. MIRA consists of two key components: (1) a calibrated Rethinking and Rearrangement module that dynamically adjusts the number of retrieved contexts to manage factual risk, and (2) a medical RAG framework integrating image embeddings and a medical knowledge base with a query-rewrite module for efficient multimodal reasoning. This enables the model to integrate both its inherent knowledge and external references effectively. Our evaluation on publicly available medical VQA and report generation benchmarks demonstrates that MIRA substantially enhances factual accuracy and overall performance, achieving new state-of-the-art results. 56 | 57 | 58 | 59 |
61 | 62 |
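For orientation, the retrieval-related flags in `scripts/mira_train/sft_clip.sh`, `sft_siglip.sh`, and `sft_biomedclip.sh` (`--rag_idx`, `--rag_mdpath`, `--rag_tokenizer all-MiniLM-L6-v2`, `--rag_topk 5`, `--query_rewrite_host`, `--query_rewrite_model`) suggest a retrieval path along the lines sketched below. This is a minimal sketch under stated assumptions, not the training-time implementation in `llava/train`: the Ollama-style chat payload, the `retrieve` helper, and the `text` column in the metadata CSV are assumptions made for illustration.

```python
import faiss                                   # vector index, as passed via --rag_idx
import pandas as pd                            # metadata table, as passed via --rag_mdpath
import requests
from sentence_transformers import SentenceTransformer


def rewrite_query(question: str,
                  host: str = "http://localhost:11434/api/chat",
                  model: str = "mistral-small:22b") -> str:
    """Optional query rewrite through an Ollama-style chat endpoint (payload shape assumed)."""
    payload = {
        "model": model,
        "messages": [{"role": "user",
                      "content": f"Rewrite this as a concise medical retrieval query: {question}"}],
        "stream": False,
    }
    resp = requests.post(host, json=payload, timeout=60)
    resp.raise_for_status()
    return resp.json()["message"]["content"]


def retrieve(question: str,
             index_path: str = "faiss_index_1222.idx",
             metadata_path: str = "metadata_1222.csv",
             encoder_name: str = "all-MiniLM-L6-v2",
             top_k: int = 5,
             rewrite: bool = False):
    """Embed the (optionally rewritten) question and return top-k reference passages."""
    query = rewrite_query(question) if rewrite else question
    encoder = SentenceTransformer(encoder_name)
    index = faiss.read_index(index_path)
    metadata = pd.read_csv(metadata_path)        # assumed: one row per indexed passage
    q_emb = encoder.encode([query], normalize_embeddings=True).astype("float32")
    scores, ids = index.search(q_emb, top_k)     # FAISS returns (scores, row ids)
    return [(float(s), metadata.iloc[int(i)]["text"])   # "text" column is an assumption
            for s, i in zip(scores[0], ids[0]) if i != -1]
```

Note that the released SFT scripts set `--query_rewrite_enabled False`, so the raw question would be embedded directly in that configuration.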
63 | 64 | 65 | 66 | 67 | --- 68 | 69 | ### Evaluation Scripts 70 | 71 | To be released... 72 | 73 | 74 | 75 | 76 | 77 | 78 | ## 📝 Citation 79 | If you use MIRA in your research, please cite the following paper: 80 | ``` 81 | @misc{mira, 82 | title={MIRA: A Novel Framework for Fusing Modalities in Medical RAG}, 83 | author={Jinhong Wang and Tajamul Ashraf and Zongyan Han and Jorma Laaksonen and Rao Mohammad Anwer}, 84 | year={2025}, 85 | eprint={2507.07902}, 86 | archivePrefix={arXiv}, 87 | primaryClass={cs.CV}, 88 | url={https://arxiv.org/abs/2507.07902}, 89 | } 90 | 91 | ``` 92 | 93 | 94 | 95 | 96 | --------------------------------------------------------------------------------