├── llava ├── serve │ ├── __init__.py │ ├── examples │ │ ├── waterview.jpg │ │ └── extreme_ironing.jpg │ ├── register_worker.py │ └── test_message.py ├── eval │ ├── .ipynb_checkpoints │ │ ├── pope-checkpoint.txt │ │ ├── note-checkpoint.txt │ │ ├── eval_textvqa-checkpoint.py │ │ ├── qa_baseline_gpt35-checkpoint.py │ │ ├── eval_pope-checkpoint.py │ │ ├── model_vqa-checkpoint.py │ │ └── model_vqa_science-checkpoint.py │ ├── webpage │ │ ├── figures │ │ │ ├── alpaca.png │ │ │ ├── bard.jpg │ │ │ ├── llama.jpg │ │ │ ├── vicuna.jpeg │ │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ │ └── chatgpt.svg │ │ └── styles.css │ ├── pope.txt │ ├── __pycache__ │ │ ├── model_vqa.cpython-310.pyc │ │ ├── eval_textvqa.cpython-310.pyc │ │ ├── m4c_evaluator.cpython-310.pyc │ │ ├── model_vqa_loader.cpython-310.pyc │ │ ├── model_vqa_mmbench.cpython-310.pyc │ │ └── model_vqa_science.cpython-310.pyc │ ├── table │ │ ├── reviewer.jsonl │ │ ├── model.jsonl │ │ └── prompt.jsonl │ ├── note.txt │ ├── eval_textvqa.py │ ├── qa_baseline_gpt35.py │ ├── summarize_gpt_review.py │ ├── model_qa.py │ ├── eval_pope.py │ ├── eval_gpt_review.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa.py │ ├── model_vqa.py │ ├── generate_webpage_data_from_table.py │ ├── eval_gpt_review_visual.py │ ├── eval_gpt_review_bench.py │ ├── model_vqa_science.py │ └── run_llava.py ├── __init__.py ├── __pycache__ │ ├── utils.cpython-310.pyc │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-38.pyc │ ├── mm_utils.cpython-310.pyc │ ├── constants.cpython-310.pyc │ └── conversation.cpython-310.pyc ├── model │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── builder.cpython-310.pyc │ │ ├── __init__.cpython-310.pyc │ │ └── llava_arch.cpython-310.pyc │ ├── language_model │ │ ├── __pycache__ │ │ │ ├── SCM.cpython-310.pyc │ │ │ ├── tome.cpython-310.pyc │ │ │ ├── FiCoCo_L.cpython-310.pyc │ │ │ ├── llava_mpt.cpython-310.pyc │ │ │ ├── text_tome.cpython-310.pyc │ │ │ ├── llava_llama.cpython-310.pyc │ │ │ ├── llava_llama.cpython-38.pyc │ │ │ └── llava_mistral.cpython-310.pyc │ │ └── llava_mpt.py │ ├── multimodal_encoder │ │ ├── __pycache__ │ │ │ ├── SCM.cpython-310.pyc │ │ │ ├── tome.cpython-310.pyc │ │ │ ├── FiCoCo_V.cpython-310.pyc │ │ │ ├── builder.cpython-310.pyc │ │ │ └── clip_encoder.cpython-310.pyc │ │ ├── builder.py │ │ └── .ipynb_checkpoints │ │ │ └── builder-checkpoint.py │ ├── multimodal_projector │ │ ├── __pycache__ │ │ │ └── builder.cpython-310.pyc │ │ ├── builder.py │ │ └── .ipynb_checkpoints │ │ │ └── builder-checkpoint.py │ ├── __init__.py │ ├── utils.py │ ├── .ipynb_checkpoints │ │ ├── utils-checkpoint.py │ │ ├── apply_delta-checkpoint.py │ │ └── make_delta-checkpoint.py │ ├── consolidate.py │ ├── apply_delta.py │ └── make_delta.py ├── train │ ├── train_mem.py │ ├── train_xformers.py │ └── llama_flash_attn_monkey_patch.py ├── constants.py └── utils.py ├── fig1.png ├── intro.png ├── main.png ├── eff_ana.png ├── main_new.png ├── method.png ├── visual.png ├── efficiency_analysis.jpg ├── scripts ├── upload_pypi.sh ├── convert_mmvet_for_eval.py ├── convert_gqa_for_eval.py ├── sqa_eval_batch.sh ├── v1_5 │ ├── eval │ │ ├── mme.sh │ │ ├── textvqa.sh │ │ ├── .ipynb_checkpoints │ │ │ ├── mme-checkpoint.sh │ │ │ ├── textvqa-checkpoint.sh │ │ │ ├── mmvet-checkpoint.sh │ │ │ ├── qbench-checkpoint.sh │ │ │ ├── vizwiz-checkpoint.sh │ │ │ ├── qbench_zh-checkpoint.sh │ │ │ ├── sqa-checkpoint.sh │ │ │ ├── mmbench-checkpoint.sh │ │ │ ├── mmbench_cn-checkpoint.sh │ │ │ ├── pope-checkpoint.sh │ │ │ ├── llavabench-checkpoint.sh │ │ │ ├── vqav2-checkpoint.sh │ │ │ ├── seed-checkpoint.sh │ │ │ └── sqa_multi-checkpoint.sh │ │ ├── mmvet.sh │ │ ├── qbench.sh │ │ ├── vizwiz.sh │ │ ├── qbench_zh.sh │ │ ├── sqa.sh │ │ ├── mmbench.sh │ │ ├── mmbench_cn.sh │ │ ├── pope.sh │ │ ├── llavabench.sh │ │ ├── vqav2.sh │ │ ├── gqa.sh │ │ ├── seed.sh │ │ └── sqa_multi.sh │ ├── finetune_task.sh │ ├── pretrain.sh │ ├── finetune.sh │ ├── finetune_task_lora.sh │ └── finetune_lora.sh ├── sqa_eval_gather.sh ├── zero2.json ├── merge_lora_weights.py ├── zero3.json ├── convert_mmbench_for_submission.py ├── finetune_sqa.sh ├── zero3_offload.json ├── pretrain_xformers.sh ├── convert_vizwiz_for_submission.py ├── pretrain.sh ├── extract_mm_projector.py ├── finetune.sh ├── finetune_full_schedule.sh ├── finetune_lora.sh ├── finetune_qlora.sh ├── convert_vqav2_for_submission.py ├── convert_seed_for_submission.py └── convert_sqa_to_llava.py ├── cog.yaml ├── .ipynb_checkpoints ├── cog-checkpoint.yaml ├── pyproject-checkpoint.toml └── README-checkpoint.md └── pyproject.toml /llava/serve/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/.ipynb_checkpoints/pope-checkpoint.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/fig1.png -------------------------------------------------------------------------------- /intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/intro.png -------------------------------------------------------------------------------- /main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/main.png -------------------------------------------------------------------------------- /eff_ana.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/eff_ana.png -------------------------------------------------------------------------------- /main_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/main_new.png -------------------------------------------------------------------------------- /method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/method.png -------------------------------------------------------------------------------- /visual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/visual.png -------------------------------------------------------------------------------- /efficiency_analysis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/efficiency_analysis.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /llava/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /llava/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /llava/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /llava/__pycache__/mm_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/__pycache__/mm_utils.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/pope.txt: -------------------------------------------------------------------------------- 1 | Dakota 2 | Drupalcon 3 | Stoneground 4 | Morangie 5 | 10 years 6 | 2222 7 | 10:50 8 | Rolex 9 | Boration 10 | -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/__pycache__/constants.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/__pycache__/constants.cpython-310.pyc -------------------------------------------------------------------------------- /llava/__pycache__/conversation.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/__pycache__/conversation.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /llava/model/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/__pycache__/model_vqa.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/eval/__pycache__/model_vqa.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/__pycache__/eval_textvqa.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/eval/__pycache__/eval_textvqa.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/__pycache__/m4c_evaluator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/eval/__pycache__/m4c_evaluator.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/__pycache__/llava_arch.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/__pycache__/llava_arch.cpython-310.pyc -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /llava/eval/__pycache__/model_vqa_loader.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/eval/__pycache__/model_vqa_loader.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/__pycache__/model_vqa_mmbench.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/eval/__pycache__/model_vqa_mmbench.cpython-310.pyc -------------------------------------------------------------------------------- /llava/eval/__pycache__/model_vqa_science.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/eval/__pycache__/model_vqa_science.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/SCM.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/language_model/__pycache__/SCM.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/tome.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/language_model/__pycache__/tome.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/SCM.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/multimodal_encoder/__pycache__/SCM.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/FiCoCo_L.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/language_model/__pycache__/FiCoCo_L.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/language_model/__pycache__/llava_mpt.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/text_tome.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/language_model/__pycache__/text_tome.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/tome.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/multimodal_encoder/__pycache__/tome.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/language_model/__pycache__/llava_llama.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_llama.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/language_model/__pycache__/llava_llama.cpython-38.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/FiCoCo_V.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/multimodal_encoder/__pycache__/FiCoCo_V.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/language_model/__pycache__/llava_mistral.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/multimodal_projector/__pycache__/builder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kawhiiiileo/FiCoCo/HEAD/llava/model/multimodal_encoder/__pycache__/clip_encoder.cpython-310.pyc -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | except: 6 | pass 7 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | IMAGE_PLACEHOLDER = "" 14 | -------------------------------------------------------------------------------- /llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /scripts/upload_pypi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Step 0: Clean up 4 | rm -rf dist 5 | 6 | # Step 1: Change the package name to "llava-torch" 7 | sed -i 's/name = "llava"/name = "llava-torch"/' pyproject.toml 8 | 9 | # Step 2: Build the package 10 | python -m build 11 | 12 | # Step 3: Revert the changes in pyproject.toml to the original 13 | sed -i 's/name = "llava-torch"/name = "llava"/' pyproject.toml 14 | 15 | # Step 4: Upload to PyPI 16 | python -m twine upload dist/* 17 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/sqa_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | for IDX in {0..7}; do 5 | CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ 6 | --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ 7 | --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ 8 | --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ 9 | --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ 10 | --num-chunks $CHUNKS \ 11 | --chunk-idx $IDX \ 12 | --conv-mode llava_v1 & 13 | done 14 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path ./llava-v1.5-7b \ 5 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 6 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 7 | --answers-file ./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | cd ./playground/data/eval/MME 12 | 13 | python convert_answer_to_mme.py --experiment llava-v1.5-13b 14 | 15 | cd eval_tool 16 | 17 | python calculation.py --results_dir answers/llava-v1.5-13b 18 | -------------------------------------------------------------------------------- /scripts/sqa_eval_gather.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | output_file="test_llava-13b.jsonl" 5 | 6 | # Clear out the output file if it exists. 7 | > "$output_file" 8 | 9 | # Loop through the indices and concatenate each file. 10 | for idx in $(seq 0 $((CHUNKS-1))); do 11 | cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" 12 | done 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ 16 | --result-file ./test_llava-13b.jsonl \ 17 | --output-file ./test_llava-13b_output.json \ 18 | --output-result ./test_llava-13b_result.json 19 | -------------------------------------------------------------------------------- /llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path ./llava-v1.5-7b \ 5 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 6 | --image-folder ./playground/data/eval/textvqa/train_images \ 7 | --answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python -m llava.eval.eval_textvqa \ 12 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 13 | --result-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl 14 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/mme-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path ./llava-v1.5-7b \ 5 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 6 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 7 | --answers-file ./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | cd ./playground/data/eval/MME 12 | 13 | python convert_answer_to_mme.py --experiment llava-v1.5-13b 14 | 15 | cd eval_tool 16 | 17 | python calculation.py --results_dir answers/llava-v1.5-13b -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/textvqa-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path ./llava-v1.5-7b \ 5 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 6 | --image-folder ./playground/data/eval/textvqa/train_images \ 7 | --answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python -m llava.eval.eval_textvqa \ 12 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 13 | --result-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl 14 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path ./llava-v1.5-7b 5 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 6 | --image-folder ./playground/data/eval/mm-vet/mm-vet/images \ 7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p ./playground/data/eval/mm-vet/results 12 | 13 | python scripts/convert_mmvet_for_eval.py \ 14 | --src ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 15 | --dst ./playground/data/eval/mm-vet/results/llava-v1.5-13b.json 16 | 17 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | echo "Evaluating in 'dev' split." 5 | elif [ "$1" = "test" ]; then 6 | echo "Evaluating in 'test' split." 7 | else 8 | echo "Unknown split, please choose between 'dev' and 'test'." 9 | exit 1 10 | fi 11 | 12 | python -m llava.eval.model_vqa_qbench \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 15 | --questions-file ./playground/data/eval/qbench/llvisionqa_$1.json \ 16 | --answers-file ./playground/data/eval/qbench/llvisionqa_$1_answers.jsonl \ 17 | --conv-mode llava_v1 \ 18 | --lang en 19 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/mmvet-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path ./llava-v1.5-7b 5 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 6 | --image-folder ./playground/data/eval/mm-vet/mm-vet/images \ 7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p ./playground/data/eval/mm-vet/results 12 | 13 | python scripts/convert_mmvet_for_eval.py \ 14 | --src ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 15 | --dst ./playground/data/eval/mm-vet/results/llava-v1.5-13b.json 16 | 17 | -------------------------------------------------------------------------------- /llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/qbench-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | echo "Evaluating in 'dev' split." 5 | elif [ "$1" = "test" ]; then 6 | echo "Evaluating in 'test' split." 7 | else 8 | echo "Unknown split, please choose between 'dev' and 'test'." 9 | exit 1 10 | fi 11 | 12 | python -m llava.eval.model_vqa_qbench \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 15 | --questions-file ./playground/data/eval/qbench/llvisionqa_$1.json \ 16 | --answers-file ./playground/data/eval/qbench/llvisionqa_$1_answers.jsonl \ 17 | --conv-mode llava_v1 \ 18 | --lang en 19 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 6 | --image-folder ./playground/data/eval/vizwiz/test \ 7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python scripts/convert_vizwiz_for_submission.py \ 12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 13 | --result-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-v1.5-13b.json 15 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench_zh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | ZH_SPLIT="验证集" 5 | echo "Evaluating in 'dev' split." 6 | elif [ "$1" = "test" ]; then 7 | ZH_SPLIT="测试集" 8 | echo "Evaluating in 'test' split." 9 | else 10 | echo "Unknown split, please choose between 'dev' and 'test'." 11 | exit 1 12 | fi 13 | 14 | python -m llava.eval.model_vqa_qbench \ 15 | --model-path liuhaotian/llava-v1.5-13b \ 16 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 17 | --questions-file ./playground/data/eval/qbench/质衡-问答-$ZH_SPLIT.json \ 18 | --answers-file ./playground/data/eval/qbench/llvisionqa_zh_$1_answers.jsonl \ 19 | --conv-mode llava_v1 \ 20 | --lang zh 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/vizwiz-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 6 | --image-folder ./playground/data/eval/vizwiz/test \ 7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python scripts/convert_vizwiz_for_submission.py \ 12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 13 | --result-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-v1.5-13b.json 15 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/qbench_zh-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | ZH_SPLIT="验证集" 5 | echo "Evaluating in 'dev' split." 6 | elif [ "$1" = "test" ]; then 7 | ZH_SPLIT="测试集" 8 | echo "Evaluating in 'test' split." 9 | else 10 | echo "Unknown split, please choose between 'dev' and 'test'." 11 | exit 1 12 | fi 13 | 14 | python -m llava.eval.model_vqa_qbench \ 15 | --model-path liuhaotian/llava-v1.5-13b \ 16 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 17 | --questions-file ./playground/data/eval/qbench/质衡-问答-$ZH_SPLIT.json \ 18 | --answers-file ./playground/data/eval/qbench/llvisionqa_zh_$1_answers.jsonl \ 19 | --conv-mode llava_v1 \ 20 | --lang zh 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_science \ 4 | --model-path ./llava-v1.5-7b \ 5 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 6 | --image-folder ./playground/data/eval/scienceqa/images/test \ 7 | --answers-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 8 | --single-pred-prompt \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | python llava/eval/eval_science_qa.py \ 13 | --base-dir ./playground/data/eval/scienceqa \ 14 | --result-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 15 | --output-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_output.jsonl \ 16 | --output-result ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_result.json 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/sqa-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_science \ 4 | --model-path ./llava-v1.5-7b \ 5 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 6 | --image-folder ./playground/data/eval/scienceqa/images/test \ 7 | --answers-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 8 | --single-pred-prompt \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | python llava/eval/eval_science_qa.py \ 13 | --base-dir ./playground/data/eval/scienceqa \ 14 | --result-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 15 | --output-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_output.jsonl \ 16 | --output-result ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_result.json 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path ./llava-v1.5-7b \ 7 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712 14 | 15 | python scripts/convert_mmbench_for_submission.py \ 16 | --annotation-file ./playground/data/eval/mmbench/mmbench_dev_20230712.tsv \ 17 | --result-dir ./playground/data/eval/mmbench/answers/mmbench_dev_20230712 \ 18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712 \ 19 | --experiment llava-v1.5-13b 20 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_cn_20231003" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --lang cn \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 18 | --result-dir ./playground/data/eval/mmbench_cn/answers/$SPLIT \ 19 | --upload-dir ./playground/data/eval/mmbench_cn/answers_upload/$SPLIT \ 20 | --experiment llava-v1.5-13b 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path /root/autodl-tmp/workplace/LLaVA/llava-v1.5-7b \ 5 | --question-file /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/llava_pope_test.jsonl \ 6 | --image-folder /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/val2014 \ 7 | --answers-file /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python llava/eval/eval_pope.py \ 12 | --annotation-dir /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/coco \ 13 | --question-file /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/llava_pope_test.jsonl \ 14 | --result-file /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/answers/llava-v1.5-13b.jsonl 15 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/mmbench-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path ./llava-v1.5-7b \ 7 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712 14 | 15 | python scripts/convert_mmbench_for_submission.py \ 16 | --annotation-file ./playground/data/eval/mmbench/mmbench_dev_20230712.tsv \ 17 | --result-dir ./playground/data/eval/mmbench/answers/mmbench_dev_20230712 \ 18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712 \ 19 | --experiment llava-v1.5-13b 20 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/.ipynb_checkpoints/builder-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | 15 | raise ValueError(f'Unknown vision tower: {vision_tower}') 16 | -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/mmbench_cn-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_cn_20231003" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --lang cn \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 18 | --result-dir ./playground/data/eval/mmbench_cn/answers/$SPLIT \ 19 | --upload-dir ./playground/data/eval/mmbench_cn/answers_upload/$SPLIT \ 20 | --experiment llava-v1.5-13b 21 | -------------------------------------------------------------------------------- /scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/pope-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path /root/autodl-tmp/workplace/LLaVA/llava-v1.5-7b \ 5 | --question-file /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/llava_pope_test.jsonl \ 6 | --image-folder /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/val2014 \ 7 | --answers-file /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python llava/eval/eval_pope.py \ 12 | --annotation-dir /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/coco \ 13 | --question-file /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/llava_pope_test.jsonl \ 14 | --result-file /root/autodl-tmp/workplace/LLaVA/playground/data/eval/pope/answers/llava-v1.5-13b.jsonl 15 | -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/model/.ipynb_checkpoints/utils-checkpoint.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - "bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/cog-checkpoint.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - "bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 6 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 7 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 12 | 13 | python llava/eval/eval_gpt_review_bench.py \ 14 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 15 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 16 | --rule llava/eval/table/rule.json \ 17 | --answer-list \ 18 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 19 | playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 20 | --output \ 21 | playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 22 | 23 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 24 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/llavabench-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 6 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 7 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 12 | 13 | python llava/eval/eval_gpt_review_bench.py \ 14 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 15 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 16 | --rule llava/eval/table/rule.json \ 17 | --answer-list \ 18 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 19 | playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 20 | --output \ 21 | playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 22 | 23 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 24 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder ./playground/data/eval/vqav2/test2015 \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 36 | 37 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/vqav2-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder ./playground/data/eval/vqav2/test2015 \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 36 | 37 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path liuhaotian/llava-v1.5-13b \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --mm_vision_select_layer -2 \ 12 | --mm_use_im_start_end False \ 13 | --mm_use_im_patch_token False \ 14 | --image_aspect_ratio pad \ 15 | --group_by_modality_length True \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-v1.5-13b-task \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 16 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to wandb 37 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version plain \ 7 | --data_path ./playground/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./playground/data/LLaVA-Pretrain/images \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-13b-pretrain \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. \ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-13b \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 16 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 1 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_gqa_testdev_balanced" 10 | GQADIR="./playground/data/eval/gqa/data" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path liuhaotian/llava-v1.5-13b \ 15 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/gqa/data/images \ 17 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 37 | 38 | cd $GQADIR 39 | python eval/eval.py --tier testdev_balanced 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.2.2.post1" 8 | description = "Towards GPT-4 like large language and visual assistant." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.1.2", "torchvision==0.16.2", 17 | "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://llava-vl.github.io" 31 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_task_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 5 | --deepspeed ./scripts/zero3.json \ 6 | --model_name_or_path liuhaotian/llava-v1.5-13b \ 7 | --version v1 \ 8 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 9 | --image_folder ./playground/data \ 10 | --vision_tower openai/clip-vit-large-patch14-336 \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-13b-task-lora \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 16 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 1 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-4 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/pyproject-checkpoint.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "llava" 7 | version = "1.2.2.post1" 8 | description = "Towards GPT-4 like large language and visual assistant." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.1.2", "torchvision==0.16.2", 17 | "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft", "bitsandbytes", 19 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==4.16.0", "gradio_client==0.8.1", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.12.6", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://llava-vl.github.io" 31 | "Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 38 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 12 | --model-path liuhaotian/llava-v1.5-13b \ 13 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench.jsonl \ 14 | --image-folder ./playground/data/eval/seed_bench \ 15 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode vicuna_v1 & 20 | done 21 | 22 | wait 23 | 24 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 25 | 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | # Evaluate 35 | python scripts/convert_seed_for_submission.py \ 36 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 37 | --result-file $output_file \ 38 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/llava-v1.5-13b.jsonl 39 | 40 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/seed-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 12 | --model-path liuhaotian/llava-v1.5-13b \ 13 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench.jsonl \ 14 | --image-folder ./playground/data/eval/seed_bench \ 15 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode vicuna_v1 & 20 | done 21 | 22 | wait 23 | 24 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 25 | 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | # Evaluate 35 | python scripts/convert_seed_for_submission.py \ 36 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 37 | --result-file $output_file \ 38 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/llava-v1.5-13b.jsonl 39 | 40 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 5 | --deepspeed ./scripts/zero3.json \ 6 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 7 | --version v1 \ 8 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 9 | --image_folder ./playground/data \ 10 | --vision_tower openai/clip-vit-large-patch14-336 \ 11 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ 12 | --mm_projector_type mlp2x_gelu \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --image_aspect_ratio pad \ 17 | --group_by_modality_length True \ 18 | --bf16 True \ 19 | --output_dir ./checkpoints/llava-v1.5-13b-lora \ 20 | --num_train_epochs 1 \ 21 | --per_device_train_batch_size 16 \ 22 | --per_device_eval_batch_size 4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --evaluation_strategy "no" \ 25 | --save_strategy "steps" \ 26 | --save_steps 50000 \ 27 | --save_total_limit 1 \ 28 | --learning_rate 2e-4 \ 29 | --weight_decay 0. \ 30 | --warmup_ratio 0.03 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --tf32 True \ 34 | --model_max_length 2048 \ 35 | --gradient_checkpointing True \ 36 | --dataloader_num_workers 4 \ 37 | --lazy_preprocess True \ 38 | --report_to wandb 39 | -------------------------------------------------------------------------------- /scripts/finetune_sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | deepspeed llava/train/train_mem.py \ 6 | --deepspeed ./scripts/zero2.json \ 7 | --model_name_or_path lmsys/vicuna-13b-v1.3 \ 8 | --version $PROMPT_VERSION \ 9 | --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \ 10 | --image_folder /Data/ScienceQA/data/scienceqa/images/train \ 11 | --vision_tower openai/clip-vit-large-patch14 \ 12 | --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \ 18 | --num_train_epochs 12 \ 19 | --per_device_train_batch_size 16 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to wandb 37 | -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /scripts/pretrain_xformers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | # MODEL_VERSION=vicuna-v1-3-7b 6 | # MODEL_VERSION=llama-2-7b-chat 7 | 8 | ########### DO NOT CHANGE ########### 9 | ########### USE THIS FOR BOTH ########### 10 | PROMPT_VERSION=plain 11 | ########### DO NOT CHANGE ########### 12 | 13 | deepspeed llava/train/train_xformers.py \ 14 | --deepspeed ./scripts/zero2.json \ 15 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 16 | --version $PROMPT_VERSION \ 17 | --data_path /path/to/pretrain_data.json \ 18 | --image_folder /path/to/images \ 19 | --vision_tower openai/clip-vit-large-patch14 \ 20 | --tune_mm_mlp_adapter True \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --bf16 False \ 25 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 4 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 2e-3 \ 35 | --weight_decay 0. \ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 False \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/sqa_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 获取GPU列表,如果没有指定CUDA_VISIBLE_DEVICES则默认使用GPU 0 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | # 计算GPU的数量 8 | CHUNKS=${#GPULIST[@]} 9 | 10 | # 设置模型路径和数据文件路径 11 | CKPT="llava-v1.5-13b" 12 | SPLIT="llava_test_CQM-A" 13 | 14 | # 对每个GPU执行模型推理任务 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_science \ 17 | --model-path ./llava-v1.5-7b \ 18 | --question-file ./playground/data/eval/scienceqa/$SPLIT.json \ 19 | --image-folder ./playground/data/eval/scienceqa/images/test \ 20 | --answers-file ./playground/data/eval/scienceqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 21 | --single-pred-prompt \ 22 | --temperature 0 \ 23 | --conv-mode vicuna_v1 \ 24 | --num-chunks $CHUNKS \ 25 | --chunk-idx $IDX & 26 | done 27 | 28 | # 等待所有GPU任务完成 29 | wait 30 | 31 | # 合并各个GPU生成的分块文件 32 | output_file=./playground/data/eval/scienceqa/answers/$CKPT/merge.jsonl 33 | 34 | # 清空输出文件(如果已存在) 35 | > "$output_file" 36 | 37 | # 遍历每个分块文件并将其内容追加到输出文件中 38 | for IDX in $(seq 0 $((CHUNKS-1))); do 39 | cat ./playground/data/eval/scienceqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 40 | done 41 | 42 | # 执行评估脚本,将答案文件转化为最终结果文件 43 | python llava/eval/eval_science_qa.py \ 44 | --base-dir ./playground/data/eval/scienceqa \ 45 | --result-file $output_file \ 46 | --output-file ./playground/data/eval/scienceqa/answers/${CKPT}_output.jsonl \ 47 | --output-result ./playground/data/eval/scienceqa/answers/${CKPT}_result.json 48 | 49 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x['question_id']: x['text'] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x['question_id'] for x in test_split]) 32 | 33 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | assert x['question_id'] in results 41 | all_answers.append({ 42 | 'image': x['image'], 43 | 'answer': answer_processor(results[x['question_id']]) 44 | }) 45 | 46 | with open(args.result_upload_file, 'w') as f: 47 | json.dump(all_answers, f) 48 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/.ipynb_checkpoints/sqa_multi-checkpoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 获取GPU列表,如果没有指定CUDA_VISIBLE_DEVICES则默认使用GPU 0 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | # 计算GPU的数量 8 | CHUNKS=${#GPULIST[@]} 9 | 10 | # 设置模型路径和数据文件路径 11 | CKPT="llava-v1.5-13b" 12 | SPLIT="llava_test_CQM-A" 13 | 14 | # 对每个GPU执行模型推理任务 15 | for IDX in $(seq 0 $((CHUNKS-1))); do 16 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_science \ 17 | --model-path ./llava-v1.5-7b \ 18 | --question-file ./playground/data/eval/scienceqa/$SPLIT.json \ 19 | --image-folder ./playground/data/eval/scienceqa/images/test \ 20 | --answers-file ./playground/data/eval/scienceqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 21 | --single-pred-prompt \ 22 | --temperature 0 \ 23 | --conv-mode vicuna_v1 \ 24 | --num-chunks $CHUNKS \ 25 | --chunk-idx $IDX & 26 | done 27 | 28 | # 等待所有GPU任务完成 29 | wait 30 | 31 | # 合并各个GPU生成的分块文件 32 | output_file=./playground/data/eval/scienceqa/answers/$CKPT/merge.jsonl 33 | 34 | # 清空输出文件(如果已存在) 35 | > "$output_file" 36 | 37 | # 遍历每个分块文件并将其内容追加到输出文件中 38 | for IDX in $(seq 0 $((CHUNKS-1))); do 39 | cat ./playground/data/eval/scienceqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 40 | done 41 | 42 | # 执行评估脚本,将答案文件转化为最终结果文件 43 | python llava/eval/eval_science_qa.py \ 44 | --base-dir ./playground/data/eval/scienceqa \ 45 | --result-file $output_file \ 46 | --output-file ./playground/data/eval/scienceqa/answers/${CKPT}_output.jsonl \ 47 | --output-result ./playground/data/eval/scienceqa/answers/${CKPT}_result.json 48 | 49 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | # MODEL_VERSION=vicuna-v1-3-7b 8 | # MODEL_VERSION=llama-2-7b-chat 9 | 10 | ########### DO NOT CHANGE ########### 11 | ########### USE THIS FOR BOTH ########### 12 | PROMPT_VERSION=plain 13 | ########### DO NOT CHANGE ########### 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 18 | --version $PROMPT_VERSION \ 19 | --data_path /path/to/pretrain_data.json \ 20 | --image_folder /path/to/images \ 21 | --vision_tower openai/clip-vit-large-patch14 \ 22 | --tune_mm_mlp_adapter True \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 16 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 1 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 24000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 2e-3 \ 37 | --weight_decay 0. \ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to wandb 47 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/.ipynb_checkpoints/builder-checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /scripts/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_80k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/finetune_full_schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_158k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 3 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 21 | --version $PROMPT_VERSION \ 22 | --data_path ./playground/data/llava_instruct_80k.json \ 23 | --image_folder /path/to/coco/train2017 \ 24 | --vision_tower openai/clip-vit-large-patch14 \ 25 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 26 | --mm_vision_select_layer -2 \ 27 | --mm_use_im_start_end False \ 28 | --mm_use_im_patch_token False \ 29 | --bf16 True \ 30 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 16 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 1 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 50000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. \ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --dataloader_num_workers 4 \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/finetune_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --bits 4 \ 21 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 22 | --version $PROMPT_VERSION \ 23 | --data_path ./playground/data/llava_instruct_80k.json \ 24 | --image_folder /path/to/coco/train2017 \ 25 | --vision_tower openai/clip-vit-large-patch14 \ 26 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 27 | --mm_vision_select_layer -2 \ 28 | --mm_use_im_start_end False \ 29 | --mm_use_im_patch_token False \ 30 | --bf16 True \ 31 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 32 | --num_train_epochs 1 \ 33 | --per_device_train_batch_size 16 \ 34 | --per_device_eval_batch_size 4 \ 35 | --gradient_accumulation_steps 1 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 50000 \ 39 | --save_total_limit 1 \ 40 | --learning_rate 2e-5 \ 41 | --weight_decay 0. \ 42 | --warmup_ratio 0.03 \ 43 | --lr_scheduler_type "cosine" \ 44 | --logging_steps 1 \ 45 | --tf32 True \ 46 | --model_max_length 2048 \ 47 | --gradient_checkpointing True \ 48 | --lazy_preprocess True \ 49 | --dataloader_num_workers 4 \ 50 | --report_to wandb 51 | -------------------------------------------------------------------------------- /scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, open(dst, 'w')) 57 | -------------------------------------------------------------------------------- /llava/eval/note.txt: -------------------------------------------------------------------------------- 1 | 取前10条数据 2 | 1. 仅对text token进行合并<由于text token数量较少, 取r=1, 2, 3, 4>进行结果分析 3 | 当r=1, 前十条数据的输出如下: 4 | Dakota 5 | Drupalcon 6 | Ston 7 | Bowmore 8 | 10 years 9 | 22 10 | 12:15 11 | Omega 12 | Ida org 13 | Philippe molitor 14 | 当r=2, 前十条数据的输出如下: 15 | Dakota 16 | Drupalcon 17 | Ston 18 | Bowmore 19 | 10 years 20 | 222 21 | 10:55 22 | Omega 23 | Ida org 24 | Philippe Molitor 25 | 当r=3, 前十条数据的输出如下: 26 | Dakota Digital 27 | Drupalcon 28 | Ston 29 | Bowmore 30 | 10 years 31 | 22222 32 | 10: 10:55 33 | Omega 34 | Ida org 35 | Philippe Molitor 36 | 当r=4, 前十条数据的输出如下: 37 | Dakota 38 | Drupalcon 39 | Stonely Self-Righteous 40 | Bowmore 41 | 10 42 | 222222 43 | 10: 10:55 44 | Omega 45 | Ida org 46 | Philippe Molitor 47 | 2. 仅对visual token进行合并进行结果分析 48 | r=0 49 | Dakota digital 50 | Drupalcon 51 | Ston 52 | Bowmore 53 | 10 years 54 | 22 55 | 11:55 56 | Omega 57 | Ida org 58 | Philippe molitor 59 | r=2 60 | Dakota 61 | Drupalcon 62 | Ston 63 | Morangie 64 | 10 years 65 | 22 66 | 11:55 67 | Omega 68 | Ida org 69 | Philippe Molitor 70 | r=5 71 | Dakota digital 72 | Drupalcon 73 | Self-righteous 74 | Bowmore 75 | 10 76 | 222222222 77 | 10: 112:20 78 | Omega 79 | Ida org 80 | Philippe Molitor 81 | r=10 82 | Dakota digital 83 | Copenhagen 84 | Ston 85 | Bowmore 86 | 10 87 | 22222222222222222222 88 | 112:500:555:500:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5 89 | Omega 90 | Ida Org 91 | Philippe Molitor 92 | r=15 93 | Dakota 94 | Drupalcon 95 | Stone 96 | Bowmore 97 | 10 years 98 | 2222222222222222222222222222222 99 | 111:50000:50:50:50:50:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5 100 | Omega 101 | Ida 102 | Philippe Molitor 103 | -------------------------------------------------------------------------------- /llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /llava/eval/.ipynb_checkpoints/note-checkpoint.txt: -------------------------------------------------------------------------------- 1 | 取前10条数据 2 | 1. 仅对text token进行合并<由于text token数量较少, 取r=1, 2, 3, 4>进行结果分析 3 | 当r=1, 前十条数据的输出如下: 4 | Dakota 5 | Drupalcon 6 | Ston 7 | Bowmore 8 | 10 years 9 | 22 10 | 12:15 11 | Omega 12 | Ida org 13 | Philippe molitor 14 | 当r=2, 前十条数据的输出如下: 15 | Dakota 16 | Drupalcon 17 | Ston 18 | Bowmore 19 | 10 years 20 | 222 21 | 10:55 22 | Omega 23 | Ida org 24 | Philippe Molitor 25 | 当r=3, 前十条数据的输出如下: 26 | Dakota Digital 27 | Drupalcon 28 | Ston 29 | Bowmore 30 | 10 years 31 | 22222 32 | 10: 10:55 33 | Omega 34 | Ida org 35 | Philippe Molitor 36 | 当r=4, 前十条数据的输出如下: 37 | Dakota 38 | Drupalcon 39 | Stonely Self-Righteous 40 | Bowmore 41 | 10 42 | 222222 43 | 10: 10:55 44 | Omega 45 | Ida org 46 | Philippe Molitor 47 | 2. 仅对visual token进行合并进行结果分析 48 | r=0 49 | Dakota digital 50 | Drupalcon 51 | Ston 52 | Bowmore 53 | 10 years 54 | 22 55 | 11:55 56 | Omega 57 | Ida org 58 | Philippe molitor 59 | r=2 60 | Dakota 61 | Drupalcon 62 | Ston 63 | Morangie 64 | 10 years 65 | 22 66 | 11:55 67 | Omega 68 | Ida org 69 | Philippe Molitor 70 | r=5 71 | Dakota digital 72 | Drupalcon 73 | Self-righteous 74 | Bowmore 75 | 10 76 | 222222222 77 | 10: 112:20 78 | Omega 79 | Ida org 80 | Philippe Molitor 81 | r=10 82 | Dakota digital 83 | Copenhagen 84 | Ston 85 | Bowmore 86 | 10 87 | 22222222222222222222 88 | 112:500:555:500:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5:5 89 | Omega 90 | Ida Org 91 | Philippe Molitor 92 | r=15 93 | Dakota 94 | Drupalcon 95 | Stone 96 | Bowmore 97 | 10 years 98 | 2222222222222222222222222222222 99 | 111:50000:50:50:50:50:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5:0:5 100 | Omega 101 | Ida 102 | Philippe Molitor 103 | -------------------------------------------------------------------------------- /llava/model/.ipynb_checkpoints/apply_delta-checkpoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /llava/eval/webpage/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 3 | background-color: #f8f9fa; 4 | } 5 | 6 | .navbar-dark .navbar-nav .nav-link { 7 | color: #f1cf68; 8 | font-size: 1.1rem; 9 | padding: 0.5rem 0.6rem; 10 | } 11 | 12 | .card-header { 13 | font-weight: bold; 14 | } 15 | 16 | .card { 17 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 18 | transition: 0.3s; 19 | } 20 | 21 | .card:hover { 22 | box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); 23 | } 24 | 25 | button { 26 | transition: background-color 0.3s; 27 | } 28 | 29 | button:hover { 30 | background-color: #007bff; 31 | } 32 | 33 | @media (max-width: 767px) { 34 | .form-row .form-group { 35 | margin-bottom: 10px; 36 | } 37 | } 38 | 39 | /* Extra styles */ 40 | 41 | .expandable-card .card-text-container { 42 | max-height: 200px; 43 | overflow-y: hidden; 44 | position: relative; 45 | } 46 | 47 | .expandable-card.expanded .card-text-container { 48 | max-height: none; 49 | } 50 | 51 | .expand-btn { 52 | position: relative; 53 | display: none; 54 | background-color: rgba(255, 255, 255, 0.8); 55 | color: #510c75; 56 | border-color: transparent; 57 | } 58 | 59 | .expand-btn:hover { 60 | background-color: rgba(200, 200, 200, 0.8); 61 | text-decoration: none; 62 | border-color: transparent; 63 | color: #510c75; 64 | } 65 | 66 | .expand-btn:focus { 67 | outline: none; 68 | text-decoration: none; 69 | } 70 | 71 | .expandable-card:not(.expanded) .card-text-container:after { 72 | content: ""; 73 | position: absolute; 74 | bottom: 0; 75 | left: 0; 76 | width: 100%; 77 | height: 90px; 78 | background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); 79 | } 80 | 81 | .expandable-card:not(.expanded) .expand-btn { 82 | margin-top: -40px; 83 | } 84 | 85 | .card-body { 86 | padding-bottom: 5px; 87 | } 88 | 89 | .vertical-flex-layout { 90 | justify-content: center; 91 | align-items: center; 92 | height: 100%; 93 | display: flex; 94 | flex-direction: column; 95 | gap: 5px; 96 | } 97 | 98 | .figure-img { 99 | max-width: 100%; 100 | height: auto; 101 | } 102 | 103 | .adjustable-font-size { 104 | font-size: calc(0.5rem + 2vw); 105 | } 106 | -------------------------------------------------------------------------------- /llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /llava/model/.ipynb_checkpoints/make_delta-checkpoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /llava/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /llava/eval/.ipynb_checkpoints/eval_textvqa-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /llava/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /llava/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /llava/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | @torch.inference_mode() 14 | def eval_model(model_name, questions_file, answers_file): 15 | # Model 16 | disable_torch_init() 17 | model_name = os.path.expanduser(model_name) 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 19 | model = AutoModelForCausalLM.from_pretrained(model_name, 20 | torch_dtype=torch.float16).cuda() 21 | 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 41 | try: 42 | index = outputs.index(conv.sep, len(prompt)) 43 | except ValueError: 44 | outputs += conv.sep 45 | index = outputs.index(conv.sep, len(prompt)) 46 | 47 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 48 | ans_id = shortuuid.uuid() 49 | ans_file.write(json.dumps({"question_id": idx, 50 | "text": outputs, 51 | "answer_id": ans_id, 52 | "model_id": model_name, 53 | "metadata": {}}) + "\n") 54 | ans_file.flush() 55 | ans_file.close() 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 60 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 61 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 62 | args = parser.parse_args() 63 | 64 | eval_model(args.model_name, args.question_file, args.answers_file) 65 | -------------------------------------------------------------------------------- /llava/eval/.ipynb_checkpoints/qa_baseline_gpt35-checkpoint.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | eval_single(args.result_file, eval_only_type='video') 63 | 64 | with open(args.result_upload_file, 'w') as fp: 65 | for question in data['questions']: 66 | qid = question['question_id'] 67 | if qid in results: 68 | result = results[qid] 69 | else: 70 | result = results[int(qid)] 71 | fp.write(json.dumps({ 72 | 'question_id': qid, 73 | 'prediction': result['text'] 74 | }) + '\n') 75 | -------------------------------------------------------------------------------- /llava/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | if file =='.ipynb_checkpoints': 76 | continue 77 | assert file.startswith('coco_pope_') 78 | assert file.endswith('.json') 79 | category = file[10:-5] 80 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 81 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 82 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 83 | print("====================================") 84 | -------------------------------------------------------------------------------- /llava/eval/.ipynb_checkpoints/eval_pope-checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | if file =='.ipynb_checkpoints': 76 | continue 77 | assert file.startswith('coco_pope_') 78 | assert file.endswith('.json') 79 | category = file[10:-5] 80 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 81 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 82 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 83 | print("====================================") 84 | -------------------------------------------------------------------------------- /scripts/convert_sqa_to_llava.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot( 13 | problems, split_indices, prompt_format, 14 | use_caption=False, is_test=False) 15 | 16 | target_format = [] 17 | for prob_id, (input, output) in split_problems.items(): 18 | if input.startswith('Question: '): 19 | input = input.replace('Question: ', '') 20 | if output.startswith('Answer: '): 21 | output = output.replace('Answer: ', '') 22 | 23 | raw_prob_data = problems[prob_id] 24 | if raw_prob_data['image'] is None: 25 | target_format.append({ 26 | "id": prob_id, 27 | "conversations": [ 28 | {'from': 'human', 'value': f"{input}"}, 29 | {'from': 'gpt', 'value': f"{output}"}, 30 | ], 31 | }) 32 | 33 | else: 34 | target_format.append({ 35 | "id": prob_id, 36 | "image": os.path.join(prob_id, raw_prob_data['image']), 37 | "conversations": [ 38 | {'from': 'human', 'value': f"{input}\n"}, 39 | {'from': 'gpt', 'value': f"{output}"}, 40 | ], 41 | }) 42 | 43 | print(f'Number of samples: {len(target_format)}') 44 | 45 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 46 | json.dump(target_format, f, indent=2) 47 | 48 | 49 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 50 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 51 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 52 | 53 | split_problems = build_prompt_chatbot( 54 | problems, split_indices, prompt_format, 55 | use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith('Question: '): 60 | input = input.replace('Question: ', '') 61 | if output.startswith('Answer: '): 62 | output = output.replace('Answer: ', '') 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data['image'] is None: 66 | data = { 67 | "id": prob_id, 68 | "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data['image']), 76 | "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + '\n') 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /llava/model/language_model/llava_mpt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import Optional, Tuple 17 | 18 | import torch 19 | 20 | from transformers import AutoConfig, AutoModelForCausalLM, \ 21 | MptConfig, MptForCausalLM, MptModel 22 | from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM 23 | 24 | 25 | class LlavaMptConfig(MptConfig): 26 | model_type = "llava_mpt" 27 | 28 | 29 | class LlavaMptModel(LlavaMetaModel, MptModel): 30 | config_class = LlavaMptConfig 31 | 32 | def __init__(self, config: MptConfig): 33 | config.hidden_size = config.d_model 34 | super(LlavaMptModel, self).__init__(config) 35 | 36 | def embed_tokens(self, x): 37 | return self.wte(x) 38 | 39 | 40 | class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM): 41 | config_class = LlavaMptConfig 42 | supports_gradient_checkpointing = True 43 | 44 | def __init__(self, config): 45 | super(MptForCausalLM, self).__init__(config) 46 | 47 | self.transformer = LlavaMptModel(config) 48 | self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False) 49 | 50 | # Initialize weights and apply final processing 51 | self.post_init() 52 | 53 | def get_model(self): 54 | return self.transformer 55 | 56 | def _set_gradient_checkpointing(self, module, value=False): 57 | if isinstance(module, LlavaMptModel): 58 | module.gradient_checkpointing = value 59 | 60 | def forward( 61 | self, 62 | input_ids: Optional[torch.LongTensor] = None, 63 | past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, 64 | attention_mask: Optional[torch.Tensor] = None, 65 | inputs_embeds: Optional[torch.Tensor] = None, 66 | labels: Optional[torch.Tensor] = None, 67 | use_cache: Optional[bool] = None, 68 | output_attentions: Optional[bool] = None, 69 | output_hidden_states: Optional[bool] = None, 70 | return_dict: Optional[bool] = None, 71 | images=None): 72 | 73 | input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images) 74 | 75 | return super().forward( 76 | input_ids, 77 | past_key_values=past_key_values, 78 | attention_mask=attention_mask, 79 | inputs_embeds=inputs_embeds, 80 | labels=labels, 81 | use_cache=use_cache, 82 | output_attentions=output_attentions, 83 | output_hidden_states=output_hidden_states, 84 | return_dict=return_dict, 85 | ) 86 | 87 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 88 | images = kwargs.pop("images", None) 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 91 | ) 92 | _inputs['images'] = images 93 | return _inputs 94 | 95 | 96 | AutoConfig.register("llava_mpt", LlavaMptConfig) 97 | AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM) 98 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /llava/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /llava/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/README-checkpoint.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

Rethinking Token Reduction in MLLMs: 4 | Towards a Unified Paradigm for Training-Free Acceleration

5 | 6 |
7 | 8 | [Yuhang Han]()1* , 9 | [Xuyang Liu]()2*, 10 | [Pengxiang Ding]()3, 11 | [Donglin Wang]()3,\ 12 | [Honggang Chen]()2✉️, 13 | [QingSen Yan]()1✉️ 14 | [Siteng Huang]()4✉️ 15 | 16 | 1Northwestern Polytechnical University, 2Sichuan University,\ 17 | 3Westlake University, 4Zhejiang University 18 | 19 |

20 | 21 |

22 |
23 | 24 |

25 | mask 26 |

27 | 28 | ## 👀 Overview 29 | 30 | To accelerate the inference of heavy Multimodal Large Language Models (MLLMs), this study rethinks the current landscape of training-free token reduction research. We regret to find that the critical components of existing methods are tightly intertwined, with their interconnections and effects remaining unclear for comparison, transfer, and expansion. Therefore, we propose a unified ''filter-correlate-compress'' paradigm that decomposes the token reduction into three distinct stages within a pipeline, maintaining consistent design objectives and elements while allowing for unique implementations. We additionally demystify the popular works and subsume them into our paradigm to showcase its universality. Finally, we offer a suite of methods grounded in the paradigm, striking a balance between speed and accuracy throughout different phases of the inference. Experimental results across 10 benchmarks indicate that our methods can achieve up to an 82.4% reduction in FLOPs with a minimal impact on performance, simultaneously surpassing state-of-the-art training-free methods. 31 |
32 | image 33 |
34 | 35 | ## 👨 Preparation 36 | 37 | 1. Clone this repository. 38 | ```bash 39 | git clone https://github.com/kawhiiiileo/FiCoCo.git 40 | cd FiCoCo 41 | ``` 42 | 43 | 2. Environment Setup and Preparation 44 | ```Shell 45 | conda create -n FiCoCo python=3.10 -y 46 | conda activate FiCoCo 47 | pip install -e . 48 | ``` 49 | 50 | 3. Download Multimodal Benchmark 51 | 52 | Please follow the detailed instruction in [LLaVA-Evaluation](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md). 53 | 54 | 4. Download LLaVa and put it under ./liuhaotian/llava-v1.5-7b. 55 | 56 | [LLaVA-1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) 57 | 58 | [Open-LLaVA-Next](https://github.com/xiaoachen98/Open-LLaVA-NeXT) 59 | 60 | 61 | ## 🎯 Usage 62 | To configure the FiCoCo model with these parameters, update the corresponding settings in your code or configuration file. Below is an example configuration: 63 | 64 | For example:\ 65 | `merge_visual: true` # Enable FiCoCo-V for visual tokens compression\ 66 | `AT: true` # Enable FiCoCo-L for visual tokens compression\ 67 | `r: 42` # Compress 42 tokens per layer\ 68 | `control_encoding_layer: 11` # Start compression from the 12th transformer layer 69 | 70 | Example for evaluating SQA results (r=42, control_encoding_layer=11, merge_visual=True): 71 | ```Shell 72 | CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/sqa.sh 73 | ``` 74 | 75 | To calculate FLOPs, we can refer to the methodology presented in the work of [LLM-viewer](https://github.com/hahnyuan/LLM-Viewer/). We deeply appreciate their outstanding contribution to this field. 76 | 77 | ## License 78 | 79 | This project is released under the [Apache 2.0 license](LICENSE). 80 | 81 | ## Citation 82 | 83 | If you use FiCoCo in your research, please cite our work by using the following BibTeX entry: 84 | ```bibtex 85 | @article{FiCoCo2024, 86 | title={Rethinking Token Reduction in MLLMs: Towards a Unified Paradigm for Training-Free Acceleration}, 87 | author={Yuhang Han and Xuyang Liu and Pengxiang Ding and Donglin Wang and Honggang Chen and Qingsen Yan and Siteng Huang}, 88 | year={2024}, 89 | eprint={2411.17686}, 90 | archivePrefix={arXiv}, 91 | primaryClass={cs.CV} 92 | } 93 | 94 | ``` 95 | ## Acknowledgment 96 | 97 | We extend our gratitude to the open-source efforts of [LLaVA](https://github.com/haotian-liu/LLaVA), [ToMe](https://github.com/facebookresearch/ToMe/) and [Open-LLaVA-NeXT](https://github.com/xiaoachen98/Open-LLaVA-NeXT). 98 | -------------------------------------------------------------------------------- /llava/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 35 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 36 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 37 | answers_file = os.path.expanduser(args.answers_file) 38 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 39 | ans_file = open(answers_file, "w") 40 | for line in tqdm(questions): 41 | idx = line["question_id"] 42 | image_file = line["image"] 43 | qs = line["text"] 44 | cur_prompt = qs 45 | if model.config.mm_use_im_start_end: 46 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 47 | else: 48 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 49 | 50 | conv = conv_templates[args.conv_mode].copy() 51 | conv.append_message(conv.roles[0], qs) 52 | conv.append_message(conv.roles[1], None) 53 | prompt = conv.get_prompt() 54 | 55 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 56 | 57 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 58 | image_tensor = process_images([image], image_processor, model.config)[0] 59 | 60 | with torch.inference_mode(): 61 | output_ids = model.generate( 62 | input_ids, 63 | images=image_tensor.unsqueeze(0).half().cuda(), 64 | image_sizes=[image.size], 65 | do_sample=True if args.temperature > 0 else False, 66 | temperature=args.temperature, 67 | top_p=args.top_p, 68 | num_beams=args.num_beams, 69 | # no_repeat_ngram_size=3, 70 | max_new_tokens=1024, 71 | use_cache=True) 72 | 73 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 74 | 75 | ans_id = shortuuid.uuid() 76 | ans_file.write(json.dumps({"question_id": idx, 77 | "prompt": cur_prompt, 78 | "text": outputs, 79 | "answer_id": ans_id, 80 | "model_id": model_name, 81 | "metadata": {}}) + "\n") 82 | ans_file.flush() 83 | ans_file.close() 84 | 85 | if __name__ == "__main__": 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 88 | parser.add_argument("--model-base", type=str, default=None) 89 | parser.add_argument("--image-folder", type=str, default="") 90 | parser.add_argument("--question-file", type=str, default="/root/autodl-tmp/workplace/LLaVA/playground/data/eval/mm-vet/llava-mm-vet.jsonl") 91 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 92 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 93 | parser.add_argument("--num-chunks", type=int, default=1) 94 | parser.add_argument("--chunk-idx", type=int, default=0) 95 | parser.add_argument("--temperature", type=float, default=0.2) 96 | parser.add_argument("--top_p", type=float, default=None) 97 | parser.add_argument("--num_beams", type=int, default=1) 98 | args = parser.parse_args() 99 | 100 | eval_model(args) 101 | -------------------------------------------------------------------------------- /llava/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from llava.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True, encoding='UTF-8') 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /llava/eval/generate_webpage_data_from_table.py: -------------------------------------------------------------------------------- 1 | """Generate json file for webpage.""" 2 | import json 3 | import os 4 | import re 5 | 6 | # models = ['llama', 'alpaca', 'gpt35', 'bard'] 7 | models = ['vicuna'] 8 | 9 | 10 | def read_jsonl(path: str, key: str=None): 11 | data = [] 12 | with open(os.path.expanduser(path)) as f: 13 | for line in f: 14 | if not line: 15 | continue 16 | data.append(json.loads(line)) 17 | if key is not None: 18 | data.sort(key=lambda x: x[key]) 19 | data = {item[key]: item for item in data} 20 | return data 21 | 22 | 23 | def trim_hanging_lines(s: str, n: int) -> str: 24 | s = s.strip() 25 | for _ in range(n): 26 | s = s.split('\n', 1)[1].strip() 27 | return s 28 | 29 | 30 | if __name__ == '__main__': 31 | questions = read_jsonl('table/question.jsonl', key='question_id') 32 | 33 | # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') 34 | # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') 35 | # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') 36 | # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') 37 | vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') 38 | ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') 39 | 40 | review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') 41 | # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') 42 | # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id') 43 | # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') 44 | # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') 45 | 46 | records = [] 47 | for qid in questions.keys(): 48 | r = { 49 | 'id': qid, 50 | 'category': questions[qid]['category'], 51 | 'question': questions[qid]['text'], 52 | 'answers': { 53 | # 'alpaca': alpaca_answers[qid]['text'], 54 | # 'llama': llama_answers[qid]['text'], 55 | # 'bard': bard_answers[qid]['text'], 56 | # 'gpt35': gpt35_answers[qid]['text'], 57 | 'vicuna': vicuna_answers[qid]['text'], 58 | 'ours': ours_answers[qid]['text'], 59 | }, 60 | 'evaluations': { 61 | # 'alpaca': review_alpaca[qid]['text'], 62 | # 'llama': review_llama[qid]['text'], 63 | # 'bard': review_bard[qid]['text'], 64 | 'vicuna': review_vicuna[qid]['content'], 65 | # 'gpt35': review_gpt35[qid]['text'], 66 | }, 67 | 'scores': { 68 | 'vicuna': review_vicuna[qid]['tuple'], 69 | # 'alpaca': review_alpaca[qid]['score'], 70 | # 'llama': review_llama[qid]['score'], 71 | # 'bard': review_bard[qid]['score'], 72 | # 'gpt35': review_gpt35[qid]['score'], 73 | }, 74 | } 75 | 76 | # cleanup data 77 | cleaned_evals = {} 78 | for k, v in r['evaluations'].items(): 79 | v = v.strip() 80 | lines = v.split('\n') 81 | # trim the first line if it's a pair of numbers 82 | if re.match(r'\d+[, ]+\d+', lines[0]): 83 | lines = lines[1:] 84 | v = '\n'.join(lines) 85 | cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') 86 | 87 | r['evaluations'] = cleaned_evals 88 | records.append(r) 89 | 90 | # Reorder the records, this is optional 91 | for r in records: 92 | if r['id'] <= 20: 93 | r['id'] += 60 94 | else: 95 | r['id'] -= 20 96 | for r in records: 97 | if r['id'] <= 50: 98 | r['id'] += 10 99 | elif 50 < r['id'] <= 60: 100 | r['id'] -= 50 101 | for r in records: 102 | if r['id'] == 7: 103 | r['id'] = 1 104 | elif r['id'] < 7: 105 | r['id'] += 1 106 | 107 | records.sort(key=lambda x: x['id']) 108 | 109 | # Write to file 110 | with open('webpage/data.json', 'w') as f: 111 | json.dump({'questions': records, 'models': models}, f, indent=2) 112 | -------------------------------------------------------------------------------- /llava/eval/.ipynb_checkpoints/model_vqa-checkpoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 35 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 36 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 37 | answers_file = os.path.expanduser(args.answers_file) 38 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 39 | ans_file = open(answers_file, "w") 40 | for line in tqdm(questions): 41 | idx = line["question_id"] 42 | image_file = line["image"] 43 | qs = line["text"] 44 | cur_prompt = qs 45 | if model.config.mm_use_im_start_end: 46 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 47 | else: 48 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 49 | 50 | conv = conv_templates[args.conv_mode].copy() 51 | conv.append_message(conv.roles[0], qs) 52 | conv.append_message(conv.roles[1], None) 53 | prompt = conv.get_prompt() 54 | 55 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 56 | 57 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 58 | image_tensor = process_images([image], image_processor, model.config)[0] 59 | 60 | with torch.inference_mode(): 61 | output_ids = model.generate( 62 | input_ids, 63 | images=image_tensor.unsqueeze(0).half().cuda(), 64 | image_sizes=[image.size], 65 | do_sample=True if args.temperature > 0 else False, 66 | temperature=args.temperature, 67 | top_p=args.top_p, 68 | num_beams=args.num_beams, 69 | # no_repeat_ngram_size=3, 70 | max_new_tokens=1024, 71 | use_cache=True) 72 | 73 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 74 | 75 | ans_id = shortuuid.uuid() 76 | ans_file.write(json.dumps({"question_id": idx, 77 | "prompt": cur_prompt, 78 | "text": outputs, 79 | "answer_id": ans_id, 80 | "model_id": model_name, 81 | "metadata": {}}) + "\n") 82 | ans_file.flush() 83 | ans_file.close() 84 | 85 | if __name__ == "__main__": 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 88 | parser.add_argument("--model-base", type=str, default=None) 89 | parser.add_argument("--image-folder", type=str, default="") 90 | parser.add_argument("--question-file", type=str, default="/root/autodl-tmp/workplace/LLaVA/playground/data/eval/mm-vet/llava-mm-vet.jsonl") 91 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 92 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 93 | parser.add_argument("--num-chunks", type=int, default=1) 94 | parser.add_argument("--chunk-idx", type=int, default=0) 95 | parser.add_argument("--temperature", type=float, default=0.2) 96 | parser.add_argument("--top_p", type=float, default=None) 97 | parser.add_argument("--num_beams", type=int, default=1) 98 | args = parser.parse_args() 99 | 100 | eval_model(args) 101 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | 86 | if isinstance(inst['caption'], list): 87 | cap_str = '\n'.join(inst['caption']) 88 | else: 89 | cap_str = inst['caption'] 90 | 91 | category = 'llava_bench_' + json.loads(ques_js)['category'] 92 | if category in rule_dict: 93 | rule = rule_dict[category] 94 | else: 95 | assert False, f"Visual QA category not found in rule file: {category}." 96 | prompt = rule['prompt'] 97 | role = rule['role'] 98 | content = (f'[Context]\n{cap_str}\n\n' 99 | f'[Question]\n{ques["text"]}\n\n' 100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 102 | f'[System]\n{prompt}\n\n') 103 | cur_js = { 104 | 'id': idx+1, 105 | 'question_id': ques['question_id'], 106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 108 | 'category': category 109 | } 110 | if idx >= len(cur_reviews): 111 | review = get_eval(content, args.max_tokens) 112 | scores = parse_score(review) 113 | cur_js['content'] = review 114 | cur_js['tuple'] = scores 115 | review_file.write(json.dumps(cur_js) + '\n') 116 | review_file.flush() 117 | else: 118 | print(f'Skipping {idx} as we already have it.') 119 | idx += 1 120 | print(idx) 121 | review_file.close() 122 | -------------------------------------------------------------------------------- /llava/eval/table/prompt.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt_id": 1, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, "description": "Prompt for general questions"} 2 | {"prompt_id": 2, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line."}, "description": "Prompt for coding questions"} 3 | {"prompt_id": 3, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."}, "description": "Prompt for math questions"} 4 | {"prompt_id": 4, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Visual Context]\n{context}\n[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, "description": "Prompt for visual questions"} 5 | -------------------------------------------------------------------------------- /llava/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | import warnings 3 | 4 | import torch 5 | 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv 8 | 9 | try: 10 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func 11 | except ImportError: 12 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func 13 | from flash_attn.bert_padding import unpad_input, pad_input 14 | 15 | 16 | def forward( 17 | self, 18 | hidden_states: torch.Tensor, 19 | attention_mask: Optional[torch.Tensor] = None, 20 | position_ids: Optional[torch.Tensor] = None, 21 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 22 | output_attentions: bool = False, 23 | use_cache: bool = False, 24 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 25 | if output_attentions: 26 | warnings.warn( 27 | "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." 28 | ) 29 | 30 | bsz, q_len, _ = hidden_states.size() 31 | 32 | query_states = ( 33 | self.q_proj(hidden_states) 34 | .view(bsz, q_len, self.num_heads, self.head_dim) 35 | .transpose(1, 2) 36 | ) 37 | key_states = ( 38 | self.k_proj(hidden_states) 39 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 40 | .transpose(1, 2) 41 | ) 42 | value_states = ( 43 | self.v_proj(hidden_states) 44 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 45 | .transpose(1, 2) 46 | ) # shape: (b, num_heads, s, head_dim) 47 | 48 | kv_seq_len = key_states.shape[-2] 49 | if past_key_value is not None: 50 | kv_seq_len += past_key_value[0].shape[-2] 51 | 52 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 53 | query_states, key_states = apply_rotary_pos_emb( 54 | query_states, key_states, cos, sin, position_ids 55 | ) 56 | 57 | if past_key_value is not None: 58 | # reuse k, v 59 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 60 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 61 | 62 | past_key_value = (key_states, value_states) if use_cache else None 63 | 64 | # repeat k/v heads if n_kv_heads < n_heads 65 | key_states = repeat_kv(key_states, self.num_key_value_groups) 66 | value_states = repeat_kv(value_states, self.num_key_value_groups) 67 | 68 | # Transform the data into the format required by flash attention 69 | qkv = torch.stack([query_states, key_states, value_states], dim=2) 70 | qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] 71 | key_padding_mask = attention_mask 72 | 73 | if key_padding_mask is None: 74 | qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim) 75 | cu_q_lens = torch.arange( 76 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 77 | ) 78 | max_s = q_len 79 | output = flash_attn_unpadded_qkvpacked_func( 80 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 81 | ) 82 | output = output.view(bsz, q_len, -1) 83 | else: 84 | qkv = qkv.reshape(bsz, q_len, -1) 85 | qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) 86 | qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) 87 | output_unpad = flash_attn_unpadded_qkvpacked_func( 88 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 89 | ) 90 | output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) 91 | output = pad_input(output_unpad, indices, bsz, q_len) 92 | 93 | return self.o_proj(output), None, past_key_value 94 | 95 | 96 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 97 | # requires the attention mask to be the same as the key_padding_mask 98 | def _prepare_decoder_attention_mask( 99 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 100 | ): 101 | # [bsz, seq_len] 102 | return attention_mask 103 | 104 | 105 | def replace_llama_attn_with_flash_attn(): 106 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 107 | if cuda_major < 8: 108 | warnings.warn( 109 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 110 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 111 | ) 112 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 113 | _prepare_decoder_attention_mask 114 | ) 115 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 116 | -------------------------------------------------------------------------------- /llava/eval/model_vqa_science.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 35 | 36 | questions = json.load(open(os.path.expanduser(args.question_file), "r")) 37 | #questions=questions[:10] 38 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 39 | answers_file = os.path.expanduser(args.answers_file) 40 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 41 | ans_file = open(answers_file, "w") 42 | for i, line in enumerate(tqdm(questions)): 43 | idx = line["id"] 44 | question = line['conversations'][0] 45 | qs = question['value'].replace('', '').strip() 46 | cur_prompt = qs 47 | 48 | if 'image' in line: 49 | image_file = line["image"] 50 | image = Image.open(os.path.join(args.image_folder, image_file)) 51 | image_tensor = process_images([image], image_processor, model.config)[0] 52 | images = image_tensor.unsqueeze(0).half().cuda() 53 | image_sizes = [image.size] 54 | if getattr(model.config, 'mm_use_im_start_end', False): 55 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 56 | else: 57 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 58 | cur_prompt = '' + '\n' + cur_prompt 59 | else: 60 | images = None 61 | image_sizes = None 62 | 63 | if args.single_pred_prompt: 64 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 65 | cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly." 66 | 67 | conv = conv_templates[args.conv_mode].copy() 68 | conv.append_message(conv.roles[0], qs) 69 | conv.append_message(conv.roles[1], None) 70 | prompt = conv.get_prompt() 71 | 72 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 73 | 74 | with torch.inference_mode(): 75 | output_ids = model.generate( 76 | input_ids, 77 | images=images, 78 | image_sizes=image_sizes, 79 | do_sample=True if args.temperature > 0 else False, 80 | temperature=args.temperature, 81 | max_new_tokens=1024, 82 | use_cache=True, 83 | ) 84 | 85 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 86 | 87 | ans_id = shortuuid.uuid() 88 | ans_file.write(json.dumps({"question_id": idx, 89 | "prompt": cur_prompt, 90 | "text": outputs, 91 | "answer_id": ans_id, 92 | "model_id": model_name, 93 | "metadata": {}}) + "\n") 94 | ans_file.flush() 95 | ans_file.close() 96 | 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser() 99 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 100 | parser.add_argument("--model-base", type=str, default=None) 101 | parser.add_argument("--image-folder", type=str, default="") 102 | parser.add_argument("--question-file", type=str, default="tables/question.json") 103 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 104 | parser.add_argument("--conv-mode", type=str, default="llava_v0") 105 | parser.add_argument("--num-chunks", type=int, default=1) 106 | parser.add_argument("--chunk-idx", type=int, default=0) 107 | parser.add_argument("--temperature", type=float, default=0.2) 108 | parser.add_argument("--answer-prompter", action="store_true") 109 | parser.add_argument("--single-pred-prompt", action="store_true") 110 | args = parser.parse_args() 111 | 112 | eval_model(args) 113 | -------------------------------------------------------------------------------- /llava/eval/.ipynb_checkpoints/model_vqa_science-checkpoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | 14 | from PIL import Image 15 | import math 16 | 17 | 18 | def split_list(lst, n): 19 | """Split a list into n (roughly) equal-sized chunks""" 20 | chunk_size = math.ceil(len(lst) / n) # integer division 21 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 22 | 23 | 24 | def get_chunk(lst, n, k): 25 | chunks = split_list(lst, n) 26 | return chunks[k] 27 | 28 | 29 | def eval_model(args): 30 | # Model 31 | disable_torch_init() 32 | model_path = os.path.expanduser(args.model_path) 33 | model_name = get_model_name_from_path(model_path) 34 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 35 | 36 | questions = json.load(open(os.path.expanduser(args.question_file), "r")) 37 | #questions=questions[:10] 38 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 39 | answers_file = os.path.expanduser(args.answers_file) 40 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 41 | ans_file = open(answers_file, "w") 42 | for i, line in enumerate(tqdm(questions)): 43 | idx = line["id"] 44 | question = line['conversations'][0] 45 | qs = question['value'].replace('', '').strip() 46 | cur_prompt = qs 47 | 48 | if 'image' in line: 49 | image_file = line["image"] 50 | image = Image.open(os.path.join(args.image_folder, image_file)) 51 | image_tensor = process_images([image], image_processor, model.config)[0] 52 | images = image_tensor.unsqueeze(0).half().cuda() 53 | image_sizes = [image.size] 54 | if getattr(model.config, 'mm_use_im_start_end', False): 55 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 56 | else: 57 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 58 | cur_prompt = '' + '\n' + cur_prompt 59 | else: 60 | images = None 61 | image_sizes = None 62 | 63 | if args.single_pred_prompt: 64 | qs = qs + '\n' + "Answer with the option's letter from the given choices directly." 65 | cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly." 66 | 67 | conv = conv_templates[args.conv_mode].copy() 68 | conv.append_message(conv.roles[0], qs) 69 | conv.append_message(conv.roles[1], None) 70 | prompt = conv.get_prompt() 71 | 72 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 73 | 74 | with torch.inference_mode(): 75 | output_ids = model.generate( 76 | input_ids, 77 | images=images, 78 | image_sizes=image_sizes, 79 | do_sample=True if args.temperature > 0 else False, 80 | temperature=args.temperature, 81 | max_new_tokens=1024, 82 | use_cache=True, 83 | ) 84 | 85 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 86 | 87 | ans_id = shortuuid.uuid() 88 | ans_file.write(json.dumps({"question_id": idx, 89 | "prompt": cur_prompt, 90 | "text": outputs, 91 | "answer_id": ans_id, 92 | "model_id": model_name, 93 | "metadata": {}}) + "\n") 94 | ans_file.flush() 95 | ans_file.close() 96 | 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser() 99 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 100 | parser.add_argument("--model-base", type=str, default=None) 101 | parser.add_argument("--image-folder", type=str, default="") 102 | parser.add_argument("--question-file", type=str, default="tables/question.json") 103 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 104 | parser.add_argument("--conv-mode", type=str, default="llava_v0") 105 | parser.add_argument("--num-chunks", type=int, default=1) 106 | parser.add_argument("--chunk-idx", type=int, default=0) 107 | parser.add_argument("--temperature", type=float, default=0.2) 108 | parser.add_argument("--answer-prompter", action="store_true") 109 | parser.add_argument("--single-pred-prompt", action="store_true") 110 | args = parser.parse_args() 111 | 112 | eval_model(args) 113 | -------------------------------------------------------------------------------- /llava/eval/run_llava.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from llava.constants import ( 5 | IMAGE_TOKEN_INDEX, 6 | DEFAULT_IMAGE_TOKEN, 7 | DEFAULT_IM_START_TOKEN, 8 | DEFAULT_IM_END_TOKEN, 9 | IMAGE_PLACEHOLDER, 10 | ) 11 | from llava.conversation import conv_templates, SeparatorStyle 12 | from llava.model.builder import load_pretrained_model 13 | from llava.utils import disable_torch_init 14 | from llava.mm_utils import ( 15 | process_images, 16 | tokenizer_image_token, 17 | get_model_name_from_path, 18 | ) 19 | 20 | from PIL import Image 21 | 22 | import requests 23 | from PIL import Image 24 | from io import BytesIO 25 | import re 26 | 27 | 28 | def image_parser(args): 29 | out = args.image_file.split(args.sep) 30 | return out 31 | 32 | 33 | def load_image(image_file): 34 | if image_file.startswith("http") or image_file.startswith("https"): 35 | response = requests.get(image_file) 36 | image = Image.open(BytesIO(response.content)).convert("RGB") 37 | else: 38 | image = Image.open(image_file).convert("RGB") 39 | return image 40 | 41 | 42 | def load_images(image_files): 43 | out = [] 44 | for image_file in image_files: 45 | image = load_image(image_file) 46 | out.append(image) 47 | return out 48 | 49 | 50 | def eval_model(args): 51 | # Model 52 | disable_torch_init() 53 | 54 | model_name = get_model_name_from_path(args.model_path) 55 | tokenizer, model, image_processor, context_len = load_pretrained_model( 56 | args.model_path, args.model_base, model_name 57 | ) 58 | 59 | qs = args.query 60 | image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN 61 | if IMAGE_PLACEHOLDER in qs: 62 | if model.config.mm_use_im_start_end: 63 | qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs) 64 | else: 65 | qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs) 66 | else: 67 | if model.config.mm_use_im_start_end: 68 | qs = image_token_se + "\n" + qs 69 | else: 70 | qs = DEFAULT_IMAGE_TOKEN + "\n" + qs 71 | 72 | if "llama-2" in model_name.lower(): 73 | conv_mode = "llava_llama_2" 74 | elif "mistral" in model_name.lower(): 75 | conv_mode = "mistral_instruct" 76 | elif "v1.6-34b" in model_name.lower(): 77 | conv_mode = "chatml_direct" 78 | elif "v1" in model_name.lower(): 79 | conv_mode = "llava_v1" 80 | elif "mpt" in model_name.lower(): 81 | conv_mode = "mpt" 82 | else: 83 | conv_mode = "llava_v0" 84 | 85 | if args.conv_mode is not None and conv_mode != args.conv_mode: 86 | print( 87 | "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format( 88 | conv_mode, args.conv_mode, args.conv_mode 89 | ) 90 | ) 91 | else: 92 | args.conv_mode = conv_mode 93 | 94 | conv = conv_templates[args.conv_mode].copy() 95 | conv.append_message(conv.roles[0], qs) 96 | conv.append_message(conv.roles[1], None) 97 | prompt = conv.get_prompt() 98 | 99 | image_files = image_parser(args) 100 | images = load_images(image_files) 101 | image_sizes = [x.size for x in images] 102 | images_tensor = process_images( 103 | images, 104 | image_processor, 105 | model.config 106 | ).to(model.device, dtype=torch.float16) 107 | 108 | input_ids = ( 109 | tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") 110 | .unsqueeze(0) 111 | .cuda() 112 | ) 113 | 114 | with torch.inference_mode(): 115 | output_ids = model.generate( 116 | input_ids, 117 | images=images_tensor, 118 | image_sizes=image_sizes, 119 | do_sample=True if args.temperature > 0 else False, 120 | temperature=args.temperature, 121 | top_p=args.top_p, 122 | num_beams=args.num_beams, 123 | max_new_tokens=args.max_new_tokens, 124 | use_cache=True, 125 | ) 126 | 127 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 128 | print(outputs) 129 | 130 | 131 | if __name__ == "__main__": 132 | parser = argparse.ArgumentParser() 133 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 134 | parser.add_argument("--model-base", type=str, default=None) 135 | parser.add_argument("--image-file", type=str, required=True) 136 | parser.add_argument("--query", type=str, required=True) 137 | parser.add_argument("--conv-mode", type=str, default=None) 138 | parser.add_argument("--sep", type=str, default=",") 139 | parser.add_argument("--temperature", type=float, default=0.2) 140 | parser.add_argument("--top_p", type=float, default=None) 141 | parser.add_argument("--num_beams", type=int, default=1) 142 | parser.add_argument("--max_new_tokens", type=int, default=512) 143 | args = parser.parse_args() 144 | 145 | eval_model(args) 146 | --------------------------------------------------------------------------------