├── 001.jpg ├── 002.jpg ├── 011.jpg ├── 022.jpg ├── 023.jpg ├── 024.jpg ├── LICENSE ├── README.md ├── cog.yaml ├── evaluation ├── mmbench_maya_full_ft.txt ├── mmbench_maya_full_ft.xlsx ├── mme_results_dump.txt ├── mme_results_toxicity_finetuned.txt ├── palo_log_8_lang_maya_gpt4-0613.log ├── palo_log_8_lang_maya_toxicity_free_gpt4-0613.log ├── pope_results.txt ├── pope_results_toxicity_finetuned.txt ├── vizwiz_results.txt ├── vizwiz_results_toxicity_finetuned.txt └── vqav2_maya_full_ft.json ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── claude_plane_test.jpeg │ ├── claude_plane_test_2.jpeg │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── maya │ │ ├── calculate_mme.py │ │ ├── convert_answer_to_mme.py │ │ ├── eval_utils.py │ │ └── load_mme.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_maya.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_science.py │ ├── model_vqa_vizwiz.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ ├── table │ │ ├── answer │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ ├── answer_bard.jsonl │ │ │ ├── answer_gpt35.jsonl │ │ │ ├── answer_llama-13b.jsonl │ │ │ └── answer_vicuna-13b.jsonl │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ ├── model.jsonl │ │ ├── prompt.jsonl │ │ ├── question.jsonl │ │ ├── results │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ ├── review │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ ├── reviewer.jsonl │ │ └── rule.json │ ├── talk2maya.py │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ └── llava_cohere.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ └── siglip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── playground ├── __init__.py ├── data │ ├── coco2014_val_gpt4_qa_30x3.jsonl │ ├── coco2014_val_qa_eval │ │ ├── qa90_gpt4_answer.jsonl │ │ └── qa90_questions.jsonl │ └── prompts │ │ ├── complex_reasoning │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ ├── 002_caps.txt │ │ ├── 002_conv.txt │ │ └── system_message.txt │ │ ├── conversation │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ └── system_message.txt │ │ └── detail_description │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ ├── 002_caps.txt │ │ ├── 002_conv.txt │ │ └── system_message.txt ├── eval │ └── eval_utils.py └── finetuning │ ├── dataset_utils.py │ └── finetune_args.py ├── predict.py ├── pyproject.toml ├── requirements.sh └── scripts ├── convert_gqa_for_eval.py ├── 
convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_seed_for_submission.py ├── convert_sqa_to_llava.py ├── convert_sqa_to_llava_base_prompt.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── extract_mm_projector.py ├── finetune.sh ├── finetune_full_schedule.sh ├── finetune_lora.sh ├── finetune_qlora.sh ├── finetune_sqa.sh ├── maya ├── eval │ ├── gqa.sh │ ├── load_mme.sh │ ├── mm-vet_evaluator.ipynb │ ├── mmbench.sh │ ├── mme.ipynb │ ├── mme.sh │ ├── mmvet.sh │ ├── mmvet_score_calc.ipynb │ ├── palo_multilingual_llava_bench.sh │ ├── pope.sh │ ├── process_prediction.py │ ├── sqa.sh │ ├── textvqa.sh │ ├── vizwiz.sh │ └── vqav2.sh ├── finetune_aya_siglip.sh ├── finetune_aya_siglip2.sh ├── pretrain_aya_siglip.sh └── pretrain_aya_siglip2.sh ├── merge_lora_weights.py ├── pretrain.sh ├── pretrain_xformers.sh ├── sqa_eval_batch.sh ├── sqa_eval_gather.sh ├── upload_pypi.sh ├── v1_5 ├── eval │ ├── eval_all_languages_deprecated.sh │ ├── gqa.sh │ ├── llavabench.sh │ ├── llavabench_palo_deprecated.sh │ ├── mmbench.sh │ ├── mmbench_cn.sh │ ├── mme.sh │ ├── mmvet.sh │ ├── pope.sh │ ├── qbench.sh │ ├── qbench_zh.sh │ ├── seed.sh │ ├── sqa.sh │ ├── textvqa.sh │ ├── vizwiz.sh │ └── vqav2.sh ├── finetune.sh ├── finetune_aya_siglip.sh ├── finetune_llava_siglip.sh ├── finetune_llava_siglip2.sh ├── finetune_lora.sh ├── finetune_task.sh ├── finetune_task_lora.sh ├── pretrain.sh ├── pretrain_aya.sh ├── pretrain_aya_siglip.sh ├── pretrain_llava_siglip.sh └── pretrain_llava_siglip2.sh ├── zero2.json ├── zero3.json └── zero3_offload.json /001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/001.jpg -------------------------------------------------------------------------------- /002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/002.jpg -------------------------------------------------------------------------------- /011.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/011.jpg -------------------------------------------------------------------------------- /022.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/022.jpg -------------------------------------------------------------------------------- /023.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/023.jpg -------------------------------------------------------------------------------- /024.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/024.jpg -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - 
"bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /evaluation/mmbench_maya_full_ft.txt: -------------------------------------------------------------------------------- 1 | Accuracy: 71.10% 2 | Precision: 71.23% 3 | Recall: 71.10% 4 | F1 Score: 71.10% 5 | -------------------------------------------------------------------------------- /evaluation/mmbench_maya_full_ft.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/evaluation/mmbench_maya_full_ft.xlsx -------------------------------------------------------------------------------- /evaluation/mme_results_dump.txt: -------------------------------------------------------------------------------- 1 | WARNING: 1 ground truths were missing during eval and defaulted to 'Yes'. This will lead to inaccurate eval. 2 | -------------Summary of Missing Prompts------------- 3 | Category: code_reasoning, File: 0020.txt, Prompt: Is a c++ code shown in the picture? Please answer yes or no. 
4 | 5 | 6 | 7 | ================= Perception ================= 8 | #-------existence-------# 9 | TP: 28 10 | FN: 2 11 | TN: 28 12 | FP: 2 13 | precision: 0.9333333333333333 14 | recall: 0.9333333333333333 15 | f1: 0.9333333333333333 16 | yes_ratio: 0.5 17 | other_num: 0 18 | acc: 0.9333333333333333 19 | acc_plus: 0.8666666666666667 20 | #-------count-------# 21 | TP: 28 22 | FN: 2 23 | TN: 15 24 | FP: 15 25 | precision: 0.6511627906976745 26 | recall: 0.9333333333333333 27 | f1: 0.767123287671233 28 | yes_ratio: 0.5 29 | other_num: 0 30 | acc: 0.7166666666666667 31 | acc_plus: 0.4666666666666667 32 | #-------position-------# 33 | TP: 27 34 | FN: 3 35 | TN: 18 36 | FP: 12 37 | precision: 0.6923076923076923 38 | recall: 0.9 39 | f1: 0.7826086956521738 40 | yes_ratio: 0.5 41 | other_num: 0 42 | acc: 0.75 43 | acc_plus: 0.5666666666666667 44 | #-------color-------# 45 | TP: 28 46 | FN: 2 47 | TN: 21 48 | FP: 9 49 | precision: 0.7567567567567568 50 | recall: 0.9333333333333333 51 | f1: 0.835820895522388 52 | yes_ratio: 0.5 53 | other_num: 0 54 | acc: 0.8166666666666667 55 | acc_plus: 0.6333333333333333 56 | #-------posters-------# 57 | TP: 90 58 | FN: 48 59 | TN: 128 60 | FP: 19 61 | precision: 0.8256880733944955 62 | recall: 0.6521739130434783 63 | f1: 0.728744939271255 64 | yes_ratio: 0.4842105263157895 65 | other_num: 9 66 | acc: 0.7414965986394558 67 | acc_plus: 0.5306122448979592 68 | #-------celebrity-------# 69 | TP: 150 70 | FN: 20 71 | TN: 72 72 | FP: 98 73 | precision: 0.6048387096774194 74 | recall: 0.8823529411764706 75 | f1: 0.7177033492822966 76 | yes_ratio: 0.5 77 | other_num: 0 78 | acc: 0.6529411764705882 79 | acc_plus: 0.3588235294117647 80 | #-------scene-------# 81 | TP: 161 82 | FN: 38 83 | TN: 179 84 | FP: 21 85 | precision: 0.8846153846153846 86 | recall: 0.8090452261306532 87 | f1: 0.8451443569553806 88 | yes_ratio: 0.49874686716791977 89 | other_num: 1 90 | acc: 0.85 91 | acc_plus: 0.725 92 | #-------landmark-------# 93 | TP: 126 94 | FN: 74 95 | TN: 176 96 | FP: 24 97 | precision: 0.84 98 | recall: 0.63 99 | f1: 0.72 100 | yes_ratio: 0.5 101 | other_num: 0 102 | acc: 0.755 103 | acc_plus: 0.535 104 | #-------artwork-------# 105 | TP: 157 106 | FN: 36 107 | TN: 95 108 | FP: 89 109 | precision: 0.6382113821138211 110 | recall: 0.8134715025906736 111 | f1: 0.7152619589977222 112 | yes_ratio: 0.5119363395225465 113 | other_num: 23 114 | acc: 0.63 115 | acc_plus: 0.35 116 | #-------OCR-------# 117 | TP: 19 118 | FN: 1 119 | TN: 4 120 | FP: 16 121 | precision: 0.5428571428571428 122 | recall: 0.95 123 | f1: 0.6909090909090908 124 | yes_ratio: 0.5 125 | other_num: 0 126 | acc: 0.575 127 | acc_plus: 0.2 128 | 129 | 130 | Total Score: 1265.3873549419768 131 | 132 | existence score: 180.0 133 | count score: 118.33333333333334 134 | position score: 131.66666666666666 135 | color score: 145.0 136 | posters score: 127.2108843537415 137 | celebrity score: 101.1764705882353 138 | scene score: 157.5 139 | landmark score: 129.0 140 | artwork score: 98.0 141 | OCR score: 77.5 142 | 143 | 144 | ================= Cognition ================= 145 | #-------commonsense_reasoning-------# 146 | TP: 53 147 | FN: 17 148 | TN: 53 149 | FP: 17 150 | precision: 0.7571428571428571 151 | recall: 0.7571428571428571 152 | f1: 0.7571428571428571 153 | yes_ratio: 0.5 154 | other_num: 0 155 | acc: 0.7571428571428571 156 | acc_plus: 0.5571428571428572 157 | #-------numerical_calculation-------# 158 | TP: 15 159 | FN: 5 160 | TN: 4 161 | FP: 16 162 | precision: 0.4838709677419355 163 | recall: 0.75 164 | 
f1: 0.5882352941176471 165 | yes_ratio: 0.5 166 | other_num: 0 167 | acc: 0.475 168 | acc_plus: 0.1 169 | #-------text_translation-------# 170 | TP: 3 171 | FN: 17 172 | TN: 18 173 | FP: 2 174 | precision: 0.6 175 | recall: 0.15 176 | f1: 0.24 177 | yes_ratio: 0.5 178 | other_num: 0 179 | acc: 0.525 180 | acc_plus: 0.1 181 | #-------code_reasoning-------# 182 | TP: 13 183 | FN: 7 184 | TN: 11 185 | FP: 9 186 | precision: 0.5909090909090909 187 | recall: 0.65 188 | f1: 0.6190476190476191 189 | yes_ratio: 0.5 190 | other_num: 0 191 | acc: 0.6 192 | acc_plus: 0.25 193 | 194 | 195 | Total Score: 336.42857142857144 196 | 197 | commonsense_reasoning score: 131.42857142857142 198 | numerical_calculation score: 57.5 199 | text_translation score: 62.5 200 | code_reasoning score: 85.0 201 | 202 | 203 | ==== Overall Metrics Across All Responses ==== 204 | TP: 898 205 | FN: 272 206 | TN: 822 207 | FP: 349 208 | precision: 0.7201283079390537 209 | recall: 0.7675213675213676 210 | f1: 0.7430699213901532 211 | yes_ratio: 0.4997864160615122 212 | other_num: 33 213 | acc: 0.7245155855096883 -------------------------------------------------------------------------------- /evaluation/mme_results_toxicity_finetuned.txt: -------------------------------------------------------------------------------- 1 | WARNING: 1 ground truths were missing during eval and defaulted to 'Yes'. This will lead to inaccurate eval. 2 | -------------Summary of Missing Prompts------------- 3 | Category: code_reasoning, File: 0020.txt, Prompt: Is a c++ code shown in the picture? Please answer yes or no. 4 | 5 | 6 | 7 | ================= Perception ================= 8 | #-------existence-------# 9 | TP: 29 10 | FN: 1 11 | TN: 30 12 | FP: 0 13 | precision: 1.0 14 | recall: 0.9666666666666667 15 | f1: 0.983050847457627 16 | yes_ratio: 0.5 17 | other_num: 0 18 | acc: 0.9833333333333333 19 | acc_plus: 0.9666666666666667 20 | #-------count-------# 21 | TP: 26 22 | FN: 4 23 | TN: 20 24 | FP: 10 25 | precision: 0.7222222222222222 26 | recall: 0.8666666666666667 27 | f1: 0.7878787878787877 28 | yes_ratio: 0.5 29 | other_num: 0 30 | acc: 0.7666666666666667 31 | acc_plus: 0.5666666666666667 32 | #-------position-------# 33 | TP: 27 34 | FN: 3 35 | TN: 11 36 | FP: 19 37 | precision: 0.5869565217391305 38 | recall: 0.9 39 | f1: 0.7105263157894737 40 | yes_ratio: 0.5 41 | other_num: 0 42 | acc: 0.6333333333333333 43 | acc_plus: 0.36666666666666664 44 | #-------color-------# 45 | TP: 30 46 | FN: 0 47 | TN: 22 48 | FP: 8 49 | precision: 0.7894736842105263 50 | recall: 1.0 51 | f1: 0.8823529411764706 52 | yes_ratio: 0.5 53 | other_num: 0 54 | acc: 0.8666666666666667 55 | acc_plus: 0.7333333333333333 56 | #-------posters-------# 57 | TP: 88 58 | FN: 58 59 | TN: 132 60 | FP: 14 61 | precision: 0.8627450980392157 62 | recall: 0.6027397260273972 63 | f1: 0.7096774193548387 64 | yes_ratio: 0.5 65 | other_num: 2 66 | acc: 0.7482993197278912 67 | acc_plus: 0.54421768707483 68 | #-------celebrity-------# 69 | TP: 124 70 | FN: 46 71 | TN: 111 72 | FP: 59 73 | precision: 0.6775956284153005 74 | recall: 0.7294117647058823 75 | f1: 0.7025495750708215 76 | yes_ratio: 0.5 77 | other_num: 0 78 | acc: 0.6911764705882353 79 | acc_plus: 0.4588235294117647 80 | #-------scene-------# 81 | TP: 164 82 | FN: 36 83 | TN: 183 84 | FP: 17 85 | precision: 0.9060773480662984 86 | recall: 0.82 87 | f1: 0.8608923884514436 88 | yes_ratio: 0.5 89 | other_num: 0 90 | acc: 0.8675 91 | acc_plus: 0.755 92 | #-------landmark-------# 93 | TP: 61 94 | FN: 139 95 | TN: 199 96 | FP: 1 97 | 
precision: 0.9838709677419355 98 | recall: 0.305 99 | f1: 0.46564885496183206 100 | yes_ratio: 0.5 101 | other_num: 0 102 | acc: 0.65 103 | acc_plus: 0.3 104 | #-------artwork-------# 105 | TP: 141 106 | FN: 51 107 | TN: 121 108 | FP: 65 109 | precision: 0.6844660194174758 110 | recall: 0.734375 111 | f1: 0.7085427135678393 112 | yes_ratio: 0.5079365079365079 113 | other_num: 22 114 | acc: 0.655 115 | acc_plus: 0.4 116 | #-------OCR-------# 117 | TP: 19 118 | FN: 1 119 | TN: 8 120 | FP: 12 121 | precision: 0.6129032258064516 122 | recall: 0.95 123 | f1: 0.7450980392156863 124 | yes_ratio: 0.5 125 | other_num: 0 126 | acc: 0.675 127 | acc_plus: 0.4 128 | 129 | 130 | Total Score: 1302.8350340136055 131 | 132 | existence score: 195.0 133 | count score: 133.33333333333334 134 | position score: 100.0 135 | color score: 160.0 136 | posters score: 129.2517006802721 137 | celebrity score: 115.0 138 | scene score: 162.25 139 | landmark score: 95.0 140 | artwork score: 105.5 141 | OCR score: 107.5 142 | 143 | 144 | ================= Cognition ================= 145 | #-------commonsense_reasoning-------# 146 | TP: 54 147 | FN: 16 148 | TN: 42 149 | FP: 28 150 | precision: 0.6585365853658537 151 | recall: 0.7714285714285715 152 | f1: 0.7105263157894737 153 | yes_ratio: 0.5 154 | other_num: 0 155 | acc: 0.6857142857142857 156 | acc_plus: 0.44285714285714284 157 | #-------numerical_calculation-------# 158 | TP: 15 159 | FN: 5 160 | TN: 5 161 | FP: 15 162 | precision: 0.5 163 | recall: 0.75 164 | f1: 0.6 165 | yes_ratio: 0.5 166 | other_num: 0 167 | acc: 0.5 168 | acc_plus: 0.15 169 | #-------text_translation-------# 170 | TP: 5 171 | FN: 15 172 | TN: 17 173 | FP: 3 174 | precision: 0.625 175 | recall: 0.25 176 | f1: 0.35714285714285715 177 | yes_ratio: 0.5 178 | other_num: 0 179 | acc: 0.55 180 | acc_plus: 0.2 181 | #-------code_reasoning-------# 182 | TP: 13 183 | FN: 7 184 | TN: 10 185 | FP: 10 186 | precision: 0.5652173913043478 187 | recall: 0.65 188 | f1: 0.6046511627906976 189 | yes_ratio: 0.5 190 | other_num: 0 191 | acc: 0.575 192 | acc_plus: 0.2 193 | 194 | 195 | Total Score: 330.3571428571429 196 | 197 | commonsense_reasoning score: 112.85714285714286 198 | numerical_calculation score: 65.0 199 | text_translation score: 75.0 200 | code_reasoning score: 77.5 201 | 202 | 203 | ==== Overall Metrics Across All Responses ==== 204 | TP: 796 205 | FN: 382 206 | TN: 911 207 | FP: 261 208 | precision: 0.7530747398297067 209 | recall: 0.6757215619694398 210 | f1: 0.712304250559284 211 | yes_ratio: 0.5012765957446809 212 | other_num: 24 213 | acc: 0.719039595619208 -------------------------------------------------------------------------------- /evaluation/pope_results.txt: -------------------------------------------------------------------------------- 1 | Category: adversarial, # samples: 3000 2 | TP FP TN FN 3 | 1149 220 1280 351 4 | Accuracy: 0.8096666666666666 5 | Precision: 0.8392987582176772 6 | Recall: 0.766 7 | F1 score: 0.8009759498082956 8 | Yes ratio: 0.4563333333333333 9 | 0.801, 0.810, 0.839, 0.766, 0.456 10 | ==================================== 11 | Category: popular, # samples: 3000 12 | TP FP TN FN 13 | 1149 125 1375 351 14 | Accuracy: 0.8413333333333334 15 | Precision: 0.901883830455259 16 | Recall: 0.766 17 | F1 score: 0.8284066330209084 18 | Yes ratio: 0.4246666666666667 19 | 0.828, 0.841, 0.902, 0.766, 0.425 20 | ==================================== 21 | Category: random, # samples: 2910 22 | TP FP TN FN 23 | 1149 78 1332 351 24 | Accuracy: 0.8525773195876288 25 | Precision: 
0.9364303178484108 26 | Recall: 0.766 27 | F1 score: 0.8426842684268426 28 | Yes ratio: 0.4216494845360825 29 | 0.843, 0.853, 0.936, 0.766, 0.422 30 | ==================================== -------------------------------------------------------------------------------- /evaluation/pope_results_toxicity_finetuned.txt: -------------------------------------------------------------------------------- 1 | Category: adversarial, # samples: 3000 2 | TP FP TN FN 3 | 1177 225 1275 323 4 | Accuracy: 0.8173333333333334 5 | Precision: 0.8395149786019972 6 | Recall: 0.7846666666666666 7 | F1 score: 0.8111647139903514 8 | Yes ratio: 0.4673333333333333 9 | 0.811, 0.817, 0.840, 0.785, 0.467 10 | ==================================== 11 | Category: popular, # samples: 3000 12 | TP FP TN FN 13 | 1177 114 1386 323 14 | Accuracy: 0.8543333333333333 15 | Precision: 0.9116963594113091 16 | Recall: 0.7846666666666666 17 | F1 score: 0.8434252955929774 18 | Yes ratio: 0.43033333333333335 19 | 0.843, 0.854, 0.912, 0.785, 0.430 20 | ==================================== 21 | Category: random, # samples: 2910 22 | TP FP TN FN 23 | 1177 79 1331 323 24 | Accuracy: 0.8618556701030928 25 | Precision: 0.9371019108280255 26 | Recall: 0.7846666666666666 27 | F1 score: 0.8541364296081279 28 | Yes ratio: 0.4316151202749141 29 | 0.854, 0.862, 0.937, 0.785, 0.432 30 | ==================================== -------------------------------------------------------------------------------- /evaluation/vizwiz_results.txt: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test": { 4 | "overall": 34.92, 5 | "other": 34.03, 6 | "unanswerable": 30.88, 7 | "yes/no": 77.02, 8 | "number": 24.63 9 | } 10 | } 11 | ] -------------------------------------------------------------------------------- /evaluation/vizwiz_results_toxicity_finetuned.txt: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test": { 4 | "overall": 34.98, 5 | "other": 33.66, 6 | "unanswerable": 32.03, 7 | "yes/no": 77.08, 8 | "number": 24.72 9 | } 10 | } 11 | ] -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig 3 | except: 4 | pass 5 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /llava/eval/claude_plane_test.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/claude_plane_test.jpeg -------------------------------------------------------------------------------- /llava/eval/claude_plane_test_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/claude_plane_test_2.jpeg -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4o-mini', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | print('rate limit error!') 31 | pass 32 | except Exception as e: 33 | print(e) 34 | time.sleep(NUM_SECONDS_TO_SLEEP) 35 | 36 | print('success!') 37 | return response['choices'][0]['message']['content'] 38 | 39 | 40 | def parse_score(review): 41 | try: 42 | score_pair = review.split('\n')[0] 43 | score_pair = score_pair.replace(',', ' ') 44 | sp = score_pair.split(' ') 45 | if len(sp) == 2: 46 | return [float(sp[0]), float(sp[1])] 47 | else: 48 | print('error', review) 49 | return [-1, -1] 50 | except Exception as e: 51 | print(e) 52 | print('error', review) 53 | return [-1, -1] 54 | 55 | 56 | if __name__ == '__main__': 57 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 58 | parser.add_argument('-q', '--question') 59 | # parser.add_argument('-a', '--answer') 60 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 61 | parser.add_argument('-r', '--rule') 62 | parser.add_argument('-o', '--output') 63 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 64 | args = parser.parse_args() 65 | 66 | ray.init() 67 | 68 | f_q = open(os.path.expanduser(args.question)) 69 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 70 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 71 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 72 | 73 | review_file = open(f'{args.output}', 'w') 74 | 75 | js_list = [] 76 | handles = [] 77 | idx = 0 78 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 79 | # if idx == 1: 80 | # break 81 | 82 | ques = json.loads(ques_js) 83 | ans1 = json.loads(ans1_js) 84 | ans2 = json.loads(ans2_js) 85 | 86 | category = json.loads(ques_js)['category'] 87 | if category in 
rule_dict: 88 | rule = rule_dict[category] 89 | else: 90 | rule = rule_dict['default'] 91 | prompt = rule['prompt'] 92 | role = rule['role'] 93 | content = (f'[Question]\n{ques["text"]}\n\n' 94 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 95 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 96 | f'[System]\n{prompt}\n\n') 97 | js_list.append({ 98 | 'id': idx+1, 99 | 'question_id': ques['question_id'], 100 | 'answer1_id': ans1['answer_id'], 101 | 'answer2_id': ans2['answer_id'], 102 | 'category': category}) 103 | idx += 1 104 | handles.append(get_eval.remote(content, args.max_tokens)) 105 | # To avoid the rate limit set by OpenAI 106 | time.sleep(NUM_SECONDS_TO_SLEEP) 107 | 108 | reviews = ray.get(handles) 109 | for idx, review in enumerate(reviews): 110 | scores = parse_score(review) 111 | js_list[idx]['content'] = review 112 | js_list[idx]['tuple'] = scores 113 | review_file.write(json.dumps(js_list[idx]) + '\n') 114 | review_file.close() 115 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | #model='gpt-4o-mini', #gpt-4-0314 16 | model='gpt-4-0613', 17 | messages=[{ 18 | 'role': 'system', 19 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 20 | }, { 21 | 'role': 'user', 22 | 'content': content, 23 | }], 24 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 25 | max_tokens=max_tokens, 26 | ) 27 | break 28 | except openai.error.RateLimitError: 29 | print('rate limit error!') 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | return response['choices'][0]['message']['content'] 36 | 37 | 38 | def parse_score(review): 39 | try: 40 | score_pair = review.split('\n')[0] 41 | score_pair = score_pair.replace(',', ' ') 42 | sp = score_pair.split(' ') 43 | if len(sp) == 2: 44 | return [float(sp[0]), float(sp[1])] 45 | else: 46 | print('error', review) 47 | print(f"Warning: Unexpected score format. Original format: {sp}") 48 | corrected_sp = sp[:2] 49 | try: 50 | print(f"Attempting to use: {corrected_sp}") 51 | return [float(corrected_sp[0]), float(corrected_sp[1])] 52 | except (ValueError, TypeError): 53 | print(f"Unable to parse scores. 
Returning default [-1, -1]") 54 | return [-1, -1] 55 | except Exception as e: 56 | print(e) 57 | print('error', review) 58 | return [-1, -1] 59 | 60 | 61 | if __name__ == '__main__': 62 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 63 | parser.add_argument('-q', '--question') 64 | parser.add_argument('-c', '--context') 65 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 66 | parser.add_argument('-r', '--rule') 67 | parser.add_argument('-o', '--output') 68 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 69 | args = parser.parse_args() 70 | 71 | f_q = open(os.path.expanduser(args.question)) 72 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 73 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 74 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 75 | 76 | if os.path.isfile(os.path.expanduser(args.output)): 77 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 78 | else: 79 | cur_reviews = [] 80 | 81 | review_file = open(f'{args.output}', 'a') 82 | 83 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 84 | image_to_context = {context['image']: context for context in context_list} 85 | 86 | handles = [] 87 | idx = 0 88 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 89 | ques = json.loads(ques_js) 90 | ans1 = json.loads(ans1_js) 91 | ans2 = json.loads(ans2_js) 92 | 93 | inst = image_to_context[ques['image']] 94 | 95 | if isinstance(inst['caption'], list): 96 | cap_str = '\n'.join(inst['caption']) 97 | else: 98 | cap_str = inst['caption'] 99 | 100 | category = 'llava_bench_' + json.loads(ques_js)['category'] 101 | if category in rule_dict: 102 | rule = rule_dict[category] 103 | else: 104 | assert False, f"Visual QA category not found in rule file: {category}." 105 | prompt = rule['prompt'] 106 | role = rule['role'] 107 | content = (f'[Context]\n{cap_str}\n\n' 108 | f'[Question]\n{ques["text"]}\n\n' 109 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 110 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 111 | f'[System]\n{prompt}\n\n') 112 | cur_js = { 113 | 'id': idx+1, 114 | 'question_id': ques['question_id'], 115 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 116 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 117 | 'category': category 118 | } 119 | if idx >= len(cur_reviews): 120 | review = get_eval(content, args.max_tokens) 121 | scores = parse_score(review) 122 | cur_js['content'] = review 123 | cur_js['tuple'] = scores 124 | review_file.write(json.dumps(cur_js) + '\n') 125 | review_file.flush() 126 | else: 127 | print(f'Skipping {idx} as we already have it.') 128 | idx += 1 129 | print(idx) 130 | review_file.close() 131 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4o-mini', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | print('rate limit error!') 29 | pass 30 | except Exception as e: 31 | print(e) 32 | time.sleep(NUM_SECONDS_TO_SLEEP) 33 | 34 | return response['choices'][0]['message']['content'] 35 | 36 | 37 | def parse_score(review): 38 | try: 39 | score_pair = review.split('\n')[0] 40 | score_pair = score_pair.replace(',', ' ') 41 | sp = score_pair.split(' ') 42 | if len(sp) == 2: 43 | return [float(sp[0]), float(sp[1])] 44 | else: 45 | print('error', review) 46 | return [-1, -1] 47 | except Exception as e: 48 | print(e) 49 | print('error', review) 50 | return [-1, -1] 51 | 52 | 53 | if __name__ == '__main__': 54 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 55 | parser.add_argument('-q', '--question') 56 | parser.add_argument('-c', '--context') 57 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 58 | parser.add_argument('-r', '--rule') 59 | parser.add_argument('-o', '--output') 60 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 61 | args = parser.parse_args() 62 | 63 | f_q = open(os.path.expanduser(args.question)) 64 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 65 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 66 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 67 | 68 | if os.path.isfile(os.path.expanduser(args.output)): 69 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 70 | else: 71 | cur_reviews = [] 72 | 73 | review_file = open(f'{args.output}', 'a') 74 | 75 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 76 | image_to_context = {context['image']: context for context in context_list} 77 | 78 | handles = [] 79 | idx = 0 80 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | inst = image_to_context[ques['image']] 86 | cap_str = '\n'.join(inst['captions']) 87 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 88 | 89 | category = json.loads(ques_js)['category'] 90 | if category in rule_dict: 91 | rule = rule_dict[category] 92 | else: 93 | assert False, f"Visual QA category not found in rule file: {category}." 
94 | prompt = rule['prompt'] 95 | role = rule['role'] 96 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 97 | f'[Question]\n{ques["text"]}\n\n' 98 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 99 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 100 | f'[System]\n{prompt}\n\n') 101 | cur_js = { 102 | 'id': idx+1, 103 | 'question_id': ques['question_id'], 104 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 105 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 106 | 'category': category 107 | } 108 | if idx >= len(cur_reviews): 109 | review = get_eval(content, args.max_tokens) 110 | scores = parse_score(review) 111 | cur_js['content'] = review 112 | cur_js['tuple'] = scores 113 | review_file.write(json.dumps(cur_js) + '\n') 114 | review_file.flush() 115 | else: 116 | print(f'Skipping {idx} as we already have it.') 117 | idx += 1 118 | print(idx) 119 | review_file.close() 120 | -------------------------------------------------------------------------------- /llava/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if 
questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /llava/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '<image>' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /llava/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /llava/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /llava/eval/generate_webpage_data_from_table.py: -------------------------------------------------------------------------------- 1 | """Generate json file for webpage.""" 2 | import json 3 | import os 4 | import re 5 | 6 | # models = ['llama', 'alpaca', 'gpt35', 'bard'] 7 | models = ['vicuna'] 8 | 9 | 10 | def read_jsonl(path: str, key: str=None): 11 | data = [] 12 | with open(os.path.expanduser(path)) as f: 13 | for line in f: 14 | if not line: 15 | continue 16 | data.append(json.loads(line)) 17 | if key is not None: 18 | data.sort(key=lambda x: x[key]) 19 | data = {item[key]: item for item in data} 20 | return data 21 | 22 | 23 | def trim_hanging_lines(s: str, n: int) -> str: 24 | s = s.strip() 25 | for _ in range(n): 26 | s = s.split('\n', 1)[1].strip() 27 | return s 28 | 29 | 30 | if __name__ == '__main__': 31 | questions = read_jsonl('table/question.jsonl', key='question_id') 32 | 33 | # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') 34 | # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') 35 | # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') 36 | # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') 37 | vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') 38 | ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') 39 | 40 | review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') 41 | # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') 42 | # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', 
key='question_id') 43 | # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') 44 | # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') 45 | 46 | records = [] 47 | for qid in questions.keys(): 48 | r = { 49 | 'id': qid, 50 | 'category': questions[qid]['category'], 51 | 'question': questions[qid]['text'], 52 | 'answers': { 53 | # 'alpaca': alpaca_answers[qid]['text'], 54 | # 'llama': llama_answers[qid]['text'], 55 | # 'bard': bard_answers[qid]['text'], 56 | # 'gpt35': gpt35_answers[qid]['text'], 57 | 'vicuna': vicuna_answers[qid]['text'], 58 | 'ours': ours_answers[qid]['text'], 59 | }, 60 | 'evaluations': { 61 | # 'alpaca': review_alpaca[qid]['text'], 62 | # 'llama': review_llama[qid]['text'], 63 | # 'bard': review_bard[qid]['text'], 64 | 'vicuna': review_vicuna[qid]['content'], 65 | # 'gpt35': review_gpt35[qid]['text'], 66 | }, 67 | 'scores': { 68 | 'vicuna': review_vicuna[qid]['tuple'], 69 | # 'alpaca': review_alpaca[qid]['score'], 70 | # 'llama': review_llama[qid]['score'], 71 | # 'bard': review_bard[qid]['score'], 72 | # 'gpt35': review_gpt35[qid]['score'], 73 | }, 74 | } 75 | 76 | # cleanup data 77 | cleaned_evals = {} 78 | for k, v in r['evaluations'].items(): 79 | v = v.strip() 80 | lines = v.split('\n') 81 | # trim the first line if it's a pair of numbers 82 | if re.match(r'\d+[, ]+\d+', lines[0]): 83 | lines = lines[1:] 84 | v = '\n'.join(lines) 85 | cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') 86 | 87 | r['evaluations'] = cleaned_evals 88 | records.append(r) 89 | 90 | # Reorder the records, this is optional 91 | for r in records: 92 | if r['id'] <= 20: 93 | r['id'] += 60 94 | else: 95 | r['id'] -= 20 96 | for r in records: 97 | if r['id'] <= 50: 98 | r['id'] += 10 99 | elif 50 < r['id'] <= 60: 100 | r['id'] -= 50 101 | for r in records: 102 | if r['id'] == 7: 103 | r['id'] = 1 104 | elif r['id'] < 7: 105 | r['id'] += 1 106 | 107 | records.sort(key=lambda x: x['id']) 108 | 109 | # Write to file 110 | with open('webpage/data.json', 'w') as f: 111 | json.dump({'questions': records, 'models': models}, f, indent=2) 112 | -------------------------------------------------------------------------------- /llava/eval/maya/convert_answer_to_mme.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | from collections import defaultdict 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | 9 | parser.add_argument('--experiment', 10 | type=str, 11 | required=True) 12 | 13 | args = parser.parse_args() 14 | return args 15 | 16 | def get_gt(data_path): 17 | GT = {} 18 | 19 | for category in os.listdir(data_path): 20 | category_dir = os.path.join(data_path, category) 21 | if not os.path.isdir(category_dir): 22 | continue 23 | if os.path.exists(os.path.join(category_dir, 'images')): 24 | image_path = os.path.join(category_dir, 'images') 25 | else: 26 | image_path = category_dir 27 | qa_path = os.path.join(category_dir, 'questions_answers_YN') 28 | assert os.path.isdir(image_path), image_path 29 | assert os.path.isdir(qa_path), qa_path 30 | for file in os.listdir(qa_path): 31 | if not file.endswith('.txt'): 32 | continue 33 | for line in open(os.path.join(qa_path, file)): 34 | question, answer = line.strip().split('\t') 35 | GT[(category, file, question)] = answer 36 | return GT 37 | 38 | if __name__ == "__main__": 39 | 40 | args = get_args() 41 | 42 | GT = get_gt( 
43 | data_path='MME_Benchmark_release_version' 44 | ) 45 | 46 | experiment = args.experiment 47 | 48 | result_dir = os.path.join('eval_tool', 'answers', experiment) 49 | os.makedirs(result_dir, exist_ok=True) 50 | 51 | answers = [json.loads(line) for line in open(os.path.join('answers', f'{experiment}.jsonl'))] 52 | 53 | results = defaultdict(list) 54 | for answer in answers: 55 | category = answer['question_id'].split('/')[0] 56 | file = answer['question_id'].split('/')[-1].split('.')[0] + '.txt' 57 | question = answer['prompt'] 58 | results[category].append((file, answer['prompt'], answer['text'])) 59 | 60 | unmatched_prompts = [] 61 | for category, cate_tups in results.items(): 62 | with open(os.path.join(result_dir, f'{category}.txt'), 'w') as fp: 63 | for file, prompt, answer in cate_tups: 64 | original_prompt = prompt # Keep original prompt for logging 65 | if 'Answer the question using a single word or phrase.' in prompt: 66 | prompt = prompt.replace('Answer the question using a single word or phrase.', '').strip() 67 | if 'Please answer yes or no.' not in prompt: 68 | prompt = prompt + ' Please answer yes or no.' 69 | if (category, file, prompt) not in GT: 70 | prompt = prompt.replace(' Please answer yes or no.', ' Please answer yes or no.') 71 | try: 72 | gt_ans = GT[(category, file, prompt)] 73 | except KeyError: 74 | # print(f"Ground truth not found for Category: {category}, File: {file}, Prompt: {prompt}") 75 | gt_ans = "Yes" # Defaults to yes 76 | unmatched_prompts.append((category, file, prompt)) 77 | tup = file, prompt, gt_ans, answer 78 | fp.write('\t'.join(tup) + '\n') 79 | 80 | if unmatched_prompts: 81 | print(f"WARNING: {len(unmatched_prompts)} ground truths were missing during eval and defaulted to 'Yes'. This will lead to inaccurate eval.") 82 | print("-------------Summary of Missing Prompts-------------") 83 | for category, file, prompt in unmatched_prompts: 84 | print(f"Category: {category}, File: {file}, Prompt: {prompt}") 85 | print('\n\n') 86 | -------------------------------------------------------------------------------- /llava/eval/maya/load_mme.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | This script loads the MME dataset from Hugging Face, saves images and Y/N answers to the specified folder structure. 
5 | """ 6 | 7 | import os 8 | import json 9 | import pandas as pd 10 | from datasets import load_dataset 11 | from PIL import Image 12 | 13 | def main(): 14 | # Define paths 15 | questions_file = "./playground/data/eval/MME/llava_mme.jsonl" 16 | answers_file = "./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl" 17 | images_folder = "./playground/data/eval/MME/MME_Benchmark_release_version" 18 | 19 | # Load the MME dataset from disk 20 | mme_dataset = load_dataset("lmms-lab/MME") 21 | print("Dataset loaded successfully.\n") 22 | 23 | # Ensure base image save directory exists 24 | image_save_base_path = "./playground/data/eval/MME/MME_Benchmark_release_version" 25 | os.makedirs(image_save_base_path, exist_ok=True) 26 | 27 | # Categories that require an extra subdirectory for Y/N answers and images 28 | special_categories = ["artwork", "celebrity", "landmark", "scene", "posters"] 29 | 30 | # Save Y/N Answers to Folder 31 | def save_answer(example): 32 | question_id = example['question_id'] # Format: category/image.png 33 | question = example['question'] 34 | answer = example['answer'] 35 | 36 | category = question_id.split('/')[0] 37 | image_name = os.path.basename(question_id) 38 | image_basename = os.path.splitext(image_name)[0] 39 | 40 | # Special case handling – one entry on HuggingFace is different than the LLaVA repo 41 | if category == 'code_reasoning' and image_basename == '0020': 42 | question = "Is a python code shown in the picture? Please answer yes or no." 43 | 44 | # Determine the save directory 45 | if category in special_categories: 46 | full_save_dir = os.path.join(image_save_base_path, category, "questions_answers_YN") 47 | else: 48 | full_save_dir = os.path.join(image_save_base_path, category, "questions_answers_YN") 49 | 50 | # Create the directory if it doesn't exist 51 | os.makedirs(full_save_dir, exist_ok=True) 52 | 53 | # Define the full path for the answer file 54 | answer_save_path = os.path.join(full_save_dir, f"{image_basename}.txt") 55 | 56 | # Append the question and answer to the text file 57 | with open(answer_save_path, 'a') as answer_file: 58 | answer_file.write(f"{question}\t{answer}\n") 59 | 60 | return {'answer_save_path': answer_save_path} 61 | 62 | # Save Images to Folder 63 | def save_images(example): 64 | image = example['image'] # Assuming 'image' is a PIL Image object 65 | image_subdir = example['question_id'] # category/image.png 66 | category = image_subdir.split('/')[0] # extract the category (first part of question_id) 67 | file_name = os.path.basename(image_subdir) # extract the file name (including extension) 68 | 69 | # Get the image extension (e.g., ".png", ".jpg") 70 | _, extension = os.path.splitext(file_name) 71 | 72 | # Check if the category is one of the special ones that needs an extra 'images' folder 73 | if category in special_categories: 74 | # Save in the category/images/ structure (e.g., artwork/images/16006.jpg) 75 | full_save_dir = os.path.join(image_save_base_path, category, "images") 76 | else: 77 | # Save in the usual category/image.png structure (e.g., code_reasoning/0012.png) 78 | full_save_dir = os.path.join(image_save_base_path, category) 79 | 80 | # Create the subdirectory if it doesn't exist 81 | os.makedirs(full_save_dir, exist_ok=True) 82 | 83 | # Create the full image save path 84 | image_save_path = os.path.join(full_save_dir, file_name) 85 | 86 | # Save the image to the specified path in the correct format 87 | if extension.lower() == ".jpg" or extension.lower() == ".jpeg": 88 | image.save(image_save_path, 
format="JPEG") 89 | elif extension.lower() == ".png": 90 | image.save(image_save_path, format="PNG") 91 | else: 92 | # Handle other formats or default to PNG (though this shouldn't happen!) 93 | image.save(image_save_path, format="PNG") 94 | 95 | return {'image_save_path': image_save_path} 96 | 97 | # Apply the save_answer and save_images functions to the 'test' split of the dataset 98 | print("Saving Y/N answers to disk...") 99 | _ = mme_dataset['test'].map(save_answer) 100 | print("Answers sucessfully saved.") 101 | print("Saving images to disk. This might slow down around the 2000 image mark; be patient, it will speed up ") 102 | _ = mme_dataset['test'].map(save_images) 103 | print("Images sucessfully saved.") 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /llava/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | @torch.inference_mode() 14 | def eval_model(model_name, questions_file, answers_file): 15 | # Model 16 | disable_torch_init() 17 | model_name = os.path.expanduser(model_name) 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 19 | model = AutoModelForCausalLM.from_pretrained(model_name, 20 | torch_dtype=torch.float16).cuda() 21 | 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 41 | try: 42 | index = outputs.index(conv.sep, len(prompt)) 43 | except ValueError: 44 | outputs += conv.sep 45 | index = outputs.index(conv.sep, len(prompt)) 46 | 47 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 48 | ans_id = shortuuid.uuid() 49 | ans_file.write(json.dumps({"question_id": idx, 50 | "text": outputs, 51 | "answer_id": ans_id, 52 | "model_id": model_name, 53 | "metadata": {}}) + "\n") 54 | ans_file.flush() 55 | ans_file.close() 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 60 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 61 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 62 | args = parser.parse_args() 63 | 64 | eval_model(args.model_name, args.question_file, args.answers_file) 65 | -------------------------------------------------------------------------------- /llava/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm 
import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from llava.eval.maya.eval_utils import load_maya_model 14 | 15 | from PIL import Image 16 | import math 17 | 18 | 19 | def split_list(lst, n): 20 | """Split a list into n (roughly) equal-sized chunks""" 21 | chunk_size = math.ceil(len(lst) / n) # integer division 22 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 23 | 24 | 25 | def get_chunk(lst, n, k): 26 | chunks = split_list(lst, n) 27 | return chunks[k] 28 | 29 | 30 | def eval_model(args): 31 | # Model 32 | disable_torch_init() 33 | model_path = os.path.expanduser(args.model_path) 34 | model_name = get_model_name_from_path(model_path) 35 | if 'maya' not in model_name: 36 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 37 | else: 38 | model, tokenizer, image_processor, context_len = load_maya_model(args.model_base, model_path, mode = args.mode) 39 | 40 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 41 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 42 | answers_file = os.path.expanduser(args.answers_file) 43 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 44 | ans_file = open(answers_file, "w") 45 | for line in tqdm(questions): 46 | idx = line["question_id"] 47 | image_file = line["image"] 48 | qs = line["text"] 49 | cur_prompt = qs 50 | if model.config.mm_use_im_start_end: 51 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 52 | else: 53 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 54 | 55 | conv = conv_templates[args.conv_mode].copy() 56 | conv.append_message(conv.roles[0], qs) 57 | conv.append_message(conv.roles[1], None) 58 | prompt = conv.get_prompt() 59 | 60 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 61 | 62 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 63 | image_tensor = process_images([image], image_processor, model.config)[0] 64 | 65 | with torch.inference_mode(): 66 | output_ids = model.generate( 67 | input_ids, 68 | images=image_tensor.unsqueeze(0).half().cuda(), 69 | image_sizes=[image.size], 70 | do_sample=True if args.temperature > 0 else False, 71 | temperature=args.temperature, 72 | top_p=args.top_p, 73 | num_beams=args.num_beams, 74 | # no_repeat_ngram_size=3, 75 | max_new_tokens=1024, 76 | use_cache=True) 77 | 78 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 79 | 80 | ans_id = shortuuid.uuid() 81 | ans_file.write(json.dumps({"question_id": idx, 82 | "prompt": cur_prompt, 83 | "text": outputs, 84 | "answer_id": ans_id, 85 | "model_id": model_name, 86 | "metadata": {}}) + "\n") 87 | ans_file.flush() 88 | ans_file.close() 89 | 90 | if __name__ == "__main__": 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument("--model-path", type=str, default="nahidalam/maya_full_ft") 93 | parser.add_argument("--model-base", type=str, default="CohereForAI/aya-23-8B") 94 | parser.add_argument("--mode", type=str, default="finetuned") 95 | 
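    # Example invocation (paths are illustrative placeholders; --num-chunks/--chunk-idx shard the
    # question file across several processes, typically one chunk per GPU):
    #   python -m llava.eval.model_vqa \
    #       --model-path nahidalam/maya_full_ft --model-base CohereForAI/aya-23-8B \
    #       --question-file questions.jsonl --image-folder ./images \
    #       --answers-file answers_chunk0.jsonl --num-chunks 4 --chunk-idx 0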
parser.add_argument("--image-folder", type=str, default="") 96 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 97 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 98 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 99 | parser.add_argument("--num-chunks", type=int, default=1) 100 | parser.add_argument("--chunk-idx", type=int, default=0) 101 | parser.add_argument("--temperature", type=float, default=0.2) 102 | parser.add_argument("--top_p", type=float, default=None) 103 | parser.add_argument("--num_beams", type=int, default=1) 104 | args = parser.parse_args() 105 | 106 | eval_model(args) 107 | -------------------------------------------------------------------------------- /llava/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /llava/eval/run_llava.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from llava.constants import ( 5 | IMAGE_TOKEN_INDEX, 6 | DEFAULT_IMAGE_TOKEN, 7 | DEFAULT_IM_START_TOKEN, 8 | DEFAULT_IM_END_TOKEN, 9 | IMAGE_PLACEHOLDER, 10 | ) 11 | from llava.conversation import conv_templates, 
SeparatorStyle 12 | from llava.model.builder import load_pretrained_model 13 | from llava.utils import disable_torch_init 14 | from llava.mm_utils import ( 15 | process_images, 16 | tokenizer_image_token, 17 | get_model_name_from_path, 18 | ) 19 | 20 | from PIL import Image 21 | 22 | import requests 23 | from PIL import Image 24 | from io import BytesIO 25 | import re 26 | 27 | 28 | def image_parser(args): 29 | out = args.image_file.split(args.sep) 30 | return out 31 | 32 | 33 | def load_image(image_file): 34 | if image_file.startswith("http") or image_file.startswith("https"): 35 | response = requests.get(image_file) 36 | image = Image.open(BytesIO(response.content)).convert("RGB") 37 | else: 38 | image = Image.open(image_file).convert("RGB") 39 | return image 40 | 41 | 42 | def load_images(image_files): 43 | out = [] 44 | for image_file in image_files: 45 | image = load_image(image_file) 46 | out.append(image) 47 | return out 48 | 49 | 50 | def eval_model(args): 51 | # Model 52 | disable_torch_init() 53 | 54 | model_name = get_model_name_from_path(args.model_path) 55 | tokenizer, model, image_processor, context_len = load_pretrained_model( 56 | args.model_path, args.model_base, model_name 57 | ) 58 | 59 | qs = args.query 60 | image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN 61 | if IMAGE_PLACEHOLDER in qs: 62 | if model.config.mm_use_im_start_end: 63 | qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs) 64 | else: 65 | qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs) 66 | else: 67 | if model.config.mm_use_im_start_end: 68 | qs = image_token_se + "\n" + qs 69 | else: 70 | qs = DEFAULT_IMAGE_TOKEN + "\n" + qs 71 | 72 | if "llama-2" in model_name.lower(): 73 | conv_mode = "llava_llama_2" 74 | elif "mistral" in model_name.lower(): 75 | conv_mode = "mistral_instruct" 76 | elif "v1.6-34b" in model_name.lower(): 77 | conv_mode = "chatml_direct" 78 | elif "v1" in model_name.lower(): 79 | conv_mode = "llava_v1" 80 | elif "mpt" in model_name.lower(): 81 | conv_mode = "mpt" 82 | else: 83 | conv_mode = "llava_v0" 84 | 85 | if args.conv_mode is not None and conv_mode != args.conv_mode: 86 | print( 87 | "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format( 88 | conv_mode, args.conv_mode, args.conv_mode 89 | ) 90 | ) 91 | else: 92 | args.conv_mode = conv_mode 93 | 94 | conv = conv_templates[args.conv_mode].copy() 95 | conv.append_message(conv.roles[0], qs) 96 | conv.append_message(conv.roles[1], None) 97 | prompt = conv.get_prompt() 98 | 99 | image_files = image_parser(args) 100 | images = load_images(image_files) 101 | image_sizes = [x.size for x in images] 102 | images_tensor = process_images( 103 | images, 104 | image_processor, 105 | model.config 106 | ).to(model.device, dtype=torch.float16) 107 | 108 | input_ids = ( 109 | tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") 110 | .unsqueeze(0) 111 | .cuda() 112 | ) 113 | 114 | with torch.inference_mode(): 115 | output_ids = model.generate( 116 | input_ids, 117 | images=images_tensor, 118 | image_sizes=image_sizes, 119 | do_sample=True if args.temperature > 0 else False, 120 | temperature=args.temperature, 121 | top_p=args.top_p, 122 | num_beams=args.num_beams, 123 | max_new_tokens=args.max_new_tokens, 124 | use_cache=True, 125 | ) 126 | 127 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 128 | print(outputs) 129 | 130 | 131 | if __name__ == "__main__": 132 | parser = argparse.ArgumentParser() 133 | 
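    # Example invocation (model path and image file are illustrative; several images can be passed
    # as a single --image-file string joined by the --sep separator):
    #   python -m llava.eval.run_llava \
    #       --model-path liuhaotian/llava-v1.5-7b \
    #       --image-file view.jpg \
    #       --query "What is shown in this image?"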
parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 134 | parser.add_argument("--model-base", type=str, default=None) 135 | parser.add_argument("--image-file", type=str, required=True) 136 | parser.add_argument("--query", type=str, required=True) 137 | parser.add_argument("--conv-mode", type=str, default=None) 138 | parser.add_argument("--sep", type=str, default=",") 139 | parser.add_argument("--temperature", type=float, default=0.2) 140 | parser.add_argument("--top_p", type=float, default=None) 141 | parser.add_argument("--num_beams", type=int, default=1) 142 | parser.add_argument("--max_new_tokens", type=int, default=512) 143 | args = parser.parse_args() 144 | 145 | eval_model(args) 146 | -------------------------------------------------------------------------------- /llava/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | 
{"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /llava/eval/table/prompt.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt_id": 1, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, "description": "Prompt for general questions"} 2 | {"prompt_id": 2, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. 
Then give extra comments starting from the next line."}, "description": "Prompt for coding questions"} 3 | {"prompt_id": 3, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."}, "description": "Prompt for math questions"} 4 | {"prompt_id": 4, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Visual Context]\n{context}\n[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, "description": "Prompt for visual questions"} 5 | -------------------------------------------------------------------------------- /llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /llava/eval/webpage/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 3 | background-color: #f8f9fa; 4 | } 5 | 6 | .navbar-dark .navbar-nav .nav-link { 7 | color: #f1cf68; 8 | font-size: 1.1rem; 9 | padding: 0.5rem 0.6rem; 10 | } 11 | 12 | .card-header { 13 | font-weight: bold; 14 | } 15 | 16 | .card { 17 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 18 | transition: 0.3s; 19 | } 20 | 21 | .card:hover { 22 | box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); 23 | } 24 | 25 | 
button { 26 | transition: background-color 0.3s; 27 | } 28 | 29 | button:hover { 30 | background-color: #007bff; 31 | } 32 | 33 | @media (max-width: 767px) { 34 | .form-row .form-group { 35 | margin-bottom: 10px; 36 | } 37 | } 38 | 39 | /* Extra styles */ 40 | 41 | .expandable-card .card-text-container { 42 | max-height: 200px; 43 | overflow-y: hidden; 44 | position: relative; 45 | } 46 | 47 | .expandable-card.expanded .card-text-container { 48 | max-height: none; 49 | } 50 | 51 | .expand-btn { 52 | position: relative; 53 | display: none; 54 | background-color: rgba(255, 255, 255, 0.8); 55 | color: #510c75; 56 | border-color: transparent; 57 | } 58 | 59 | .expand-btn:hover { 60 | background-color: rgba(200, 200, 200, 0.8); 61 | text-decoration: none; 62 | border-color: transparent; 63 | color: #510c75; 64 | } 65 | 66 | .expand-btn:focus { 67 | outline: none; 68 | text-decoration: none; 69 | } 70 | 71 | .expandable-card:not(.expanded) .card-text-container:after { 72 | content: ""; 73 | position: absolute; 74 | bottom: 0; 75 | left: 0; 76 | width: 100%; 77 | height: 90px; 78 | background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); 79 | } 80 | 81 | .expandable-card:not(.expanded) .expand-btn { 82 | margin-top: -40px; 83 | } 84 | 85 | .card-body { 86 | padding-bottom: 5px; 87 | } 88 | 89 | .vertical-flex-layout { 90 | justify-content: center; 91 | align-items: center; 92 | height: 100%; 93 | display: flex; 94 | flex-direction: column; 95 | gap: 5px; 96 | } 97 | 98 | .figure-img { 99 | max-width: 100%; 100 | height: auto; 101 | } 102 | 103 | .adjustable-font-size { 104 | font-size: calc(0.5rem + 2vw); 105 | } 106 | -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | from .language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig 6 | except: 7 | pass 8 | -------------------------------------------------------------------------------- /llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += 
base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} 
dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | from .siglip_encoder import SiglipVisionTower 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | elif 'siglip' in vision_tower: 15 | return SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | raise ValueError(f'Unknown vision tower: {vision_tower}') 17 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | 
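            # Each extra depth level appends GELU + Linear(hidden_size -> hidden_size); e.g.
            # projector_type='mlp2x_gelu' builds Linear(mm_hidden_size -> hidden_size) -> GELU -> Linear(hidden_size -> hidden_size).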
modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | import warnings 3 | 4 | import torch 5 | 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv 8 | 9 | try: 10 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func 11 | except ImportError: 12 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func 13 | from flash_attn.bert_padding import unpad_input, pad_input 14 | 15 | 16 | def forward( 17 | self, 18 | hidden_states: torch.Tensor, 19 | attention_mask: Optional[torch.Tensor] = None, 20 | position_ids: Optional[torch.Tensor] = None, 21 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 22 | output_attentions: bool = False, 23 | use_cache: bool = False, 24 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 25 | if output_attentions: 26 | warnings.warn( 27 | "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." 
28 | ) 29 | 30 | bsz, q_len, _ = hidden_states.size() 31 | 32 | query_states = ( 33 | self.q_proj(hidden_states) 34 | .view(bsz, q_len, self.num_heads, self.head_dim) 35 | .transpose(1, 2) 36 | ) 37 | key_states = ( 38 | self.k_proj(hidden_states) 39 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 40 | .transpose(1, 2) 41 | ) 42 | value_states = ( 43 | self.v_proj(hidden_states) 44 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 45 | .transpose(1, 2) 46 | ) # shape: (b, num_heads, s, head_dim) 47 | 48 | kv_seq_len = key_states.shape[-2] 49 | if past_key_value is not None: 50 | kv_seq_len += past_key_value[0].shape[-2] 51 | 52 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 53 | query_states, key_states = apply_rotary_pos_emb( 54 | query_states, key_states, cos, sin, position_ids 55 | ) 56 | 57 | if past_key_value is not None: 58 | # reuse k, v 59 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 60 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 61 | 62 | past_key_value = (key_states, value_states) if use_cache else None 63 | 64 | # repeat k/v heads if n_kv_heads < n_heads 65 | key_states = repeat_kv(key_states, self.num_key_value_groups) 66 | value_states = repeat_kv(value_states, self.num_key_value_groups) 67 | 68 | # Transform the data into the format required by flash attention 69 | qkv = torch.stack([query_states, key_states, value_states], dim=2) 70 | qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] 71 | key_padding_mask = attention_mask 72 | 73 | if key_padding_mask is None: 74 | qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim) 75 | cu_q_lens = torch.arange( 76 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 77 | ) 78 | max_s = q_len 79 | output = flash_attn_unpadded_qkvpacked_func( 80 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 81 | ) 82 | output = output.view(bsz, q_len, -1) 83 | else: 84 | qkv = qkv.reshape(bsz, q_len, -1) 85 | qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) 86 | qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) 87 | output_unpad = flash_attn_unpadded_qkvpacked_func( 88 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 89 | ) 90 | output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) 91 | output = pad_input(output_unpad, indices, bsz, q_len) 92 | 93 | return self.o_proj(output), None, past_key_value 94 | 95 | 96 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 97 | # requires the attention mask to be the same as the key_padding_mask 98 | def _prepare_decoder_attention_mask( 99 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 100 | ): 101 | # [bsz, seq_len] 102 | return attention_mask 103 | 104 | 105 | def replace_llama_attn_with_flash_attn(): 106 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 107 | if cuda_major < 8: 108 | warnings.warn( 109 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 
110 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 111 | ) 112 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 113 | _prepare_decoder_attention_mask 114 | ) 115 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 116 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /llava/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from llava.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True, encoding='UTF-8') 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 
63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /playground/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/playground/__init__.py -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/000_caps.txt: -------------------------------------------------------------------------------- 1 | A man wearing multiple neck ties making a goofy face. 2 | A man in a white shirt wearing very many ties. 3 | a man with ties on poses for a picture 4 | A man wearing multiple ties on his neck. 5 | A young man smiles while wearing several ties. 
6 | 7 | tie: [0.574, 0.298, 0.752, 0.704] 8 | tie: [0.464, 0.339, 0.639, 0.789] 9 | tie: [0.349, 0.363, 0.563, 0.732] 10 | tie: [0.259, 0.255, 0.668, 0.805] 11 | person: [0.019, 0.065, 0.962, 0.988] 12 | person: [0.0, 0.24, 0.214, 1.0] 13 | tie: [0.316, 0.778, 0.443, 0.867] 14 | tie: [0.386, 0.707, 0.496, 0.801] 15 | tie: [0.251, 0.354, 0.402, 0.747] 16 | tie: [0.44, 0.318, 0.63, 0.397] 17 | tie: [0.566, 0.33, 0.655, 0.543] 18 | tie: [0.25, 0.409, 0.359, 0.556] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/000_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What is unusual about this photo? 3 | === 4 | Answer: 5 | In the photo, the man is wearing a total of ten ties around his neck. This is considered unusual because it is not common for someone to wear multiple ties at once. In a typical setting, a person would wear only one tie. The man's goofy facial expression also adds to the unconventional and humorous nature of the image. -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/001_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges do these people face? 3 | === 4 | Answer: 5 | In the image, a group of people is standing outside a black SUV in a parking area, surrounded by various pieces of luggage, including suitcases and backpacks. They are facing the challenge of fitting all their luggage into the black SUV. There are multiple suitcases and backpacks to be packed, which suggests that the group has a significant amount of belongings to accommodate. They might have to strategize and arrange the luggage efficiently to ensure that everything fits properly into the vehicle. Additionally, they need to consider the comfort of the passengers and visibility while driving, so the placement of the luggage must not obstruct the driver's view or make the passengers uncomfortable during the trip. -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/002_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 
3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. 6 | 7 | fire hydrant: [0.326, 0.612, 0.426, 0.72] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/002_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges might this city face? 3 | === 4 | Answer: 5 | The city faces challenges due to the harsh winter conditions and heavy snowfall. In the image, a red fire hydrant is almost buried deep in the snow, which indicates the significant amount of snow the city has experienced. This can lead to various challenges such as difficulties in transportation, increased risk of accidents, and disruptions to daily life. For example, the recently plowed sidewalk near the fire hydrant shows that the city has to constantly clear snow from roads and sidewalks to maintain access and safety for pedestrians and vehicles. Moreover, emergency services, like firefighters, might face challenges accessing crucial equipment, such as fire hydrants, during emergencies due to the snow accumulation. This highlights the importance of effective snow management strategies and preparedness in such cities to minimize the impact of harsh winter conditions on residents and essential services. -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | The task is to use the provided caption and bounding box information, create a plausible question about the image, and provide the answer in detail. 4 | 5 | Create complex questions beyond describing the scene. 6 | To answer such questions, one should require first understanding the visual content, then based on the background knowledge or reasoning, either explain why the things are happening that way, or provide guides and help to user's request. Make the question challenging by not including the visual content details in the question so that the user needs to reason about that first. 7 | 8 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 9 | 10 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. -------------------------------------------------------------------------------- /playground/data/prompts/conversation/000_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 
3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. -------------------------------------------------------------------------------- /playground/data/prompts/conversation/000_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What color is the fire hydrant in the image? 3 | === 4 | Answer: 5 | The fire hydrant in the image is red. 6 | === 7 | Question: 8 | Is the fire hydrant fully covered by snow or partially visible? 9 | === 10 | Answer: 11 | The fire hydrant is partially visible, with a significant portion of it covered by the deep snow. 12 | === 13 | Question: 14 | What is the condition of the sidewalk near the fire hydrant? 15 | === 16 | Answer: 17 | The sidewalk near the fire hydrant has been recently plowed, so it is relatively clear of snow compared to the surrounding area. 18 | === 19 | Question: 20 | What does the image suggest about the winter season in the city? 21 | === 22 | Answer: 23 | The image suggests that the winter season in the city has been quite harsh and challenging. The significant snow accumulation seen in the image indicates that the city has experienced heavy snowfall, which can lead to various difficulties and disruptions for its residents. 24 | 25 | For instance, the deep snow covering the fire hydrant could potentially hinder its accessibility in case of emergencies, which poses safety concerns. The recently plowed sidewalk implies that the city's maintenance crews have been actively working to keep the walkways clear and safe for pedestrians, but the sheer amount of snowfall might make it difficult to keep up with the continuous snow removal efforts. 26 | 27 | Furthermore, such extreme winter conditions can impact transportation, leading to traffic issues, delays in public transit, and increased risks of accidents due to slippery roads. It can also cause problems for businesses and schools, as people might face challenges commuting to work or attending classes. Additionally, the heavy snow can put extra strain on infrastructure, such as roofs and power lines, increasing the likelihood of structural damage or power outages. 28 | 29 | In conclusion, the image of the red fire hydrant deep in the snow and the recently plowed sidewalk suggest that the city has faced a particularly severe winter season, with substantial snowfall that has likely caused various challenges and disruptions for its residents and infrastructure. -------------------------------------------------------------------------------- /playground/data/prompts/conversation/001_caps.txt: -------------------------------------------------------------------------------- 1 | A man is skiing in the open snow covered hills 2 | A skier is making his way into the snow. 3 | A skier on the foothills of a huge mountain range. 4 | A skier looks at mountains while standing near a trail sign. 5 | a single person skiing by an area with a lot of bushes -------------------------------------------------------------------------------- /playground/data/prompts/conversation/001_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What is the skier doing? 3 | === 4 | Answer: 5 | The skier is skiing in the snow-covered hills, making his way through the snow and enjoying the beautiful mountain scenery. They are also standing near a trail sign, which indicates their location on the mountain. 
6 | === 7 | Question: 8 | Are there any other people or skiers visible in the image? 9 | === 10 | Answer: 11 | No, there is only a single person skiing in the image, and they are surrounded by an area with a lot of bushes. 12 | === 13 | Question: 14 | What kind of skiing is the person doing? Cross-country or downhill? 15 | === 16 | Answer: 17 | Based on what I can see in the photo, it is not possible to definitively determine whether the skier is doing cross-country or downhill skiing. However, given the presence of a trail sign and the fact that they are skiing in the foothills, it is more likely that the skier is engaged in cross-country skiing. Downhill skiing typically takes place on more steep slopes and involves ski lifts, whereas cross-country skiing often occurs on relatively flatter terrain with marked trails. 18 | === 19 | Question: 20 | Can you explain cross-country and downhill skiing a bit more? 21 | === 22 | Answer: 23 | Cross-country skiing and downhill skiing are two distinct types of skiing, each with its own techniques, equipment, and terrain preferences. 24 | 25 | 1. Cross-Country Skiing: 26 | Also known as Nordic skiing, cross-country skiing involves traversing relatively flat terrain or gentle slopes using skis and poles. It is a popular recreational and competitive sport that focuses on endurance, balance, and aerobic fitness. Cross-country skiers use their own locomotion to move across snow-covered terrain, rather than relying on ski lifts or other forms of assistance. 27 | There are two main techniques in cross-country skiing: 28 | - Classic: This technique involves skiing in parallel tracks, using a forward-and-backward gliding motion while keeping the skis parallel to each other. The skier uses their poles to propel themselves forward. 29 | - Skate skiing: This technique is similar to ice skating, where the skier pushes off one ski at an angle and glides on the other. It requires a smooth, groomed surface and is typically faster than the classic technique. 30 | Cross-country ski equipment consists of lightweight skis, boots, bindings, and poles. The boots are more flexible than those used in downhill skiing, allowing for better ankle movement and control. 31 | 32 | 2. Downhill Skiing: 33 | Also known as alpine skiing, downhill skiing involves descending slopes at high speeds using skis and poles for balance and control. This sport is more focused on speed, technique, and navigating challenging terrain, including steep slopes, moguls, and even jumps. 34 | Downhill skiing can be further categorized into several disciplines, such as slalom, giant slalom, super-G, and downhill racing. Each discipline has its own set of rules, courses, and ski equipment. 35 | Downhill ski equipment includes heavier and stiffer skis, boots, bindings, and poles than those used in cross-country skiing. The boots are more rigid to provide better support and control during high-speed descents and sharp turns. 36 | 37 | In summary, cross-country skiing is an endurance-based sport that involves traveling across flat or gently sloping terrain, while downhill skiing is focused on speed and technique as skiers navigate steeper slopes and challenging terrain. Both sports require specialized equipment and techniques, but they offer different experiences and challenges to participants. 
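Note on the layout of the few-shot files above: each *_conv.txt under playground/data/prompts/ alternates "Question:" and "Answer:" blocks separated by lines containing only "===". The following is a minimal sketch (a hypothetical helper, not a script that ships with this repository) of how such a file could be parsed into question/answer pairs under that assumption:

# Hypothetical helper, not part of the repository: split a playground
# prompts *_conv.txt file on "===" separators and collect (question, answer) pairs.
from pathlib import Path
from typing import List, Tuple


def parse_conv_file(path: str) -> List[Tuple[str, str]]:
    blocks = [b.strip() for b in Path(path).read_text().split("===")]
    pairs, question = [], None
    for block in blocks:
        if block.startswith("Question:"):
            question = block[len("Question:"):].strip()
        elif block.startswith("Answer:") and question is not None:
            pairs.append((question, block[len("Answer:"):].strip()))
            question = None
    return pairs


if __name__ == "__main__":
    # Example path; adjust to wherever the prompt files are checked out.
    for q, a in parse_conv_file("playground/data/prompts/conversation/000_conv.txt"):
        print(q, "->", a[:60])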
-------------------------------------------------------------------------------- /playground/data/prompts/conversation/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant, and you are seeing a single image. What you see are provided with five sentences, describing the same image you are looking at. Answer all questions as you are seeing the image. 2 | 3 | Design a conversation between you and a person asking about this photo. The answers should be in a tone that a visual AI assistant is seeing the image and answering the question. 4 | Ask diverse questions and give corresponding answers. 5 | 6 | Include questions asking about the visual content of the image, including the object types, counting the objects, object actions, object locations, relative positions between objects, etc. Only include questions that have definite answers: 7 | (1) one can see the content in the image that the question asks about and can answer confidently; 8 | (2) one can determine confidently from the image that it is not in the image. 9 | Do not ask any question that cannot be answered confidently. 10 | 11 | Also include complex questions that are relevant to the content in the image, for example, asking about background knowledge of the objects in the image, asking to discuss about events happening in the image, etc. Again, do not ask about uncertain details. 12 | Provide detailed answers when answering complex questions. For example, give detailed examples or reasoning steps to make the content more convincing and well-organized. You can include multiple paragraphs if necessary. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/000_caps.txt: -------------------------------------------------------------------------------- 1 | A harbor filled with lots of boats next to a building. 2 | A bicycle parked in front of several boats at a dock. 3 | A red bicycle in front of a line of docked white yachts 4 | A bike sits before boats which sit before a long building. 5 | A bicycle is a convenient means of land transportation when you live on a boat. 6 | 7 | bicycle: [0.287, 0.641, 0.507, 0.874] 8 | bicycle: [0.566, 0.667, 0.63, 0.731] 9 | boat: [0.318, 0.579, 0.575, 0.724] 10 | boat: [0.704, 0.607, 0.818, 0.727] 11 | boat: [0.818, 0.601, 0.942, 0.744] 12 | boat: [0.002, 0.53, 0.243, 0.71] 13 | boat: [0.541, 0.611, 0.668, 0.731] 14 | person: [0.778, 0.527, 0.797, 0.57] 15 | cup: [0.708, 0.733, 0.724, 0.758] 16 | boat: [0.236, 0.532, 0.404, 0.64] 17 | boat: [0.81, 0.632, 0.836, 0.676] 18 | boat: [0.957, 0.526, 1.0, 0.752] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/000_conv.txt: -------------------------------------------------------------------------------- 1 | It is a harbor filled with numerous boats of various sizes docked next to a long building. Among the boats, there are a few white yachts lined up, standing out from the rest. There is a red bicycle prominently parked in front of the line of docked boats, serving as a convenient means of land transportation for those living on the boats. Another bicycle can be seen further back in the scene, near the middle of the harbor. 2 | 3 | A person is visible near the right side of the harbor, possibly enjoying the view or attending to their boat. Additionally, there is a cup placed on a surface near the middle of the scene. 
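Note on the box annotations above: the *_caps.txt files pair each caption set with bounding boxes in the normalized (x1, y1, x2, y2) form described in the system messages, i.e. floats in [0, 1] giving the top-left and bottom-right corners. The following is a minimal sketch (a hypothetical helper, not part of the repository) of mapping such a box back to pixel coordinates, assuming the image's width and height are known:

# Hypothetical helper, not part of the repository: scale a normalized
# (x1, y1, x2, y2) box -- top-left / bottom-right corners in [0, 1] --
# to integer pixel coordinates for an image of the given size.
from typing import Tuple


def to_pixels(box: Tuple[float, float, float, float],
              width: int, height: int) -> Tuple[int, int, int, int]:
    x1, y1, x2, y2 = box
    return (round(x1 * width), round(y1 * height),
            round(x2 * width), round(y2 * height))


if __name__ == "__main__":
    # "bicycle" box from detail_description/000_caps.txt; 640x480 is an assumed image size.
    print(to_pixels((0.287, 0.641, 0.507, 0.874), width=640, height=480))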
-------------------------------------------------------------------------------- /playground/data/prompts/detail_description/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/001_conv.txt: -------------------------------------------------------------------------------- 1 | The image is an underground parking area with a black sport utility vehicle (SUV) parked. There are three people in the scene, with one person standing closer to the left side of the vehicle, another person in the middle, and the third person on the right side. They are all working together to pack their luggage into the SUV for a trip. 2 | 3 | In the parking area, various luggage items are scattered around the vehicle. There are two backpacks, one located near the left rear wheel and the other closer to the right side of the vehicle. Additionally, there are two suitcases, one on the right side of the car and another further away near the center of the parking area. A bicycle can also be seen on the left side of the vehicle. 4 | 5 | Other cars are parked around the main SUV, with one car positioned behind it and slightly to the left, another behind and slightly to the right, and the third car further behind on the right side. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/002_caps.txt: -------------------------------------------------------------------------------- 1 | A man holds a Wii-mote above his head while another looks on. 2 | A guy and his friend are playing Nintendo Wii. 3 | A young man is holding a video game remote over his head. 4 | two men standing in a room while one plays with a wii mote 5 | Some guys standing and playing a video game. 6 | 7 | couch: [0.697, 0.759, 0.995, 1.0] 8 | dining table: [0.426, 0.755, 1.0, 0.987] 9 | person: [0.082, 0.252, 0.342, 1.0] 10 | person: [0.399, 0.085, 0.742, 0.982] 11 | remote: [0.477, 0.135, 0.516, 0.187] 12 | sink: [0.016, 0.501, 0.063, 0.52] 13 | potted plant: [0.798, 0.384, 0.888, 0.645] 14 | refrigerator: [0.305, 0.389, 0.414, 0.547] 15 | chair: [0.72, 0.509, 0.858, 0.725] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/002_conv.txt: -------------------------------------------------------------------------------- 1 | The image shows two men standing in a room, engaged in playing a video game on a Nintendo Wii console. 
One of the men is holding a Wii remote above his head with enthusiasm, while the other man looks on, likely enjoying the friendly competition. 2 | 3 | The room appears to be a living space with a couch located in the background and a dining table nearby. A potted plant can be seen placed close to the couch, and a chair is situated in the middle of the room. The room also features a kitchen area with a sink and a refrigerator visible in the background. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | Using the provided caption and bounding box information, describe the scene in a detailed manner. 4 | 5 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 6 | 7 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "maya" 7 | version = "1.0" 8 | description = "Maya: An Instruction Finetuned Multilingual Multimodal Model." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | 16 | dependencies = [ 17 | "torch==2.1.2", "torchvision==0.16.2", 18 | "transformers==4.47.0", "tokenizers==0.21", "sentencepiece==0.1.99", "shortuuid==1.0.13", 19 | "accelerate==0.27.2", "datasets==2.15.0", "peft==0.12.0", "bitsandbytes==0.43.3", 20 | "pydantic==2.8.2", "markdown2[all]", "numpy==1.26.4", "scikit-learn==1.2.2", 21 | "gradio==4.16.0", "gradio_client==0.8.1", 22 | "requests==2.32.3", "httpx==0.24.0", "uvicorn", "fastapi", 23 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 24 | ] 25 | 26 | [project.optional-dependencies] 27 | train = ["deepspeed==0.14.2", "ninja", "wandb==0.17.8"] 28 | build = ["build", "twine"] 29 | 30 | [project.urls] 31 | "Homepage" = "https://huggingface.co/maya-multimodal" 32 | "Bug Tracker" = "https://github.com/nahidalam/maya/issues" 33 | 34 | [tool.setuptools.packages.find] 35 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 36 | 37 | [tool.wheel] 38 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 39 | -------------------------------------------------------------------------------- /requirements.sh: -------------------------------------------------------------------------------- 1 | pip install chardet==5.2.0 2 | pip install datasets==2.15.0 3 | pip install deepspeed==0.14.2 4 | pip install fastapi==0.111.0 5 | pip install transformers==4.42.3 6 | pip install accelerate==0.27.2 7 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, 
f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | eval_single(args.result_file, eval_only_type='video') 63 | 64 | with open(args.result_upload_file, 'w') as fp: 65 | for question in data['questions']: 66 | qid = question['question_id'] 67 | if qid in results: 68 | result = results[qid] 69 | else: 70 | result = results[int(qid)] 71 | fp.write(json.dumps({ 72 | 'question_id': qid, 73 | 'prediction': result['text'] 74 | }) + '\n') 75 | -------------------------------------------------------------------------------- /scripts/convert_sqa_to_llava.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot( 13 | problems, split_indices, prompt_format, 14 | use_caption=False, is_test=False) 15 | 16 | target_format = [] 17 | for prob_id, (input, output) in split_problems.items(): 18 | if input.startswith('Question: '): 19 | input = input.replace('Question: ', '') 20 | if output.startswith('Answer: '): 21 | output = output.replace('Answer: ', '') 22 | 23 | raw_prob_data = problems[prob_id] 24 | if raw_prob_data['image'] is None: 25 | target_format.append({ 26 | "id": prob_id, 27 | "conversations": [ 28 | {'from': 'human', 'value': f"{input}"}, 29 | {'from': 'gpt', 'value': f"{output}"}, 30 | ], 31 | }) 32 | 33 | else: 34 | target_format.append({ 35 | "id": prob_id, 36 | "image": os.path.join(prob_id, raw_prob_data['image']), 37 | "conversations": [ 38 | {'from': 'human', 'value': f"{input}\n"}, 39 | {'from': 'gpt', 'value': f"{output}"}, 40 | ], 41 | }) 42 | 43 | print(f'Number of samples: {len(target_format)}') 44 | 45 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 46 | json.dump(target_format, f, indent=2) 47 | 48 | 49 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 50 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 51 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 52 | 53 | split_problems = build_prompt_chatbot( 54 | problems, split_indices, prompt_format, 55 | use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith('Question: '): 60 | input = input.replace('Question: ', '') 61 | if output.startswith('Answer: '): 62 | output = output.replace('Answer: ', '') 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data['image'] is None: 66 | data = { 67 | "id": prob_id, 68 | "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data['image']), 76 | "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + '\n') 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | 
os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | 30 | # Map question_id (concatenation of image + question) to result text 31 | results = {x['question_id']: x['text'] for x in results} 32 | 33 | # Load the entire annotation file (as it's a JSON array) 34 | with open(args.annotation_file, 'r') as f: 35 | test_split = json.load(f) 36 | 37 | # Generate question_id by concatenating image + _ + question 38 | split_ids = set([x['image'] +"_"+ x['question'] for x in test_split]) 39 | 40 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 41 | 42 | all_answers = [] 43 | 44 | answer_processor = EvalAIAnswerProcessor() 45 | 46 | for x in test_split: 47 | question_id = x['question'] + "_"+ x['image'] 48 | try: 49 | assert question_id in results 50 | except: 51 | break 52 | all_answers.append({ 53 | 'image': x['image'], 54 | 'question' : x['question'], 55 | 'answer': answer_processor(results[question_id]) 56 | }) 57 | 58 | # Save the processed answers 59 | with open(args.result_upload_file, 'w') as f: 60 | json.dump(all_answers, f) 61 | -------------------------------------------------------------------------------- /scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, open(dst, 'w')) 57 | -------------------------------------------------------------------------------- /scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 
3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /scripts/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_80k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. 
\ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/finetune_full_schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_158k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 3 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 21 | --version $PROMPT_VERSION \ 22 | --data_path ./playground/data/llava_instruct_80k.json \ 23 | --image_folder /path/to/coco/train2017 \ 24 | --vision_tower openai/clip-vit-large-patch14 \ 25 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 26 | --mm_vision_select_layer -2 \ 27 | --mm_use_im_start_end False \ 28 | --mm_use_im_patch_token False \ 29 | --bf16 True \ 30 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 16 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 1 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 50000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. \ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --dataloader_num_workers 4 \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/finetune_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --bits 4 \ 21 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 22 | --version $PROMPT_VERSION \ 23 | --data_path ./playground/data/llava_instruct_80k.json \ 24 | --image_folder /path/to/coco/train2017 \ 25 | --vision_tower openai/clip-vit-large-patch14 \ 26 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 27 | --mm_vision_select_layer -2 \ 28 | --mm_use_im_start_end False \ 29 | --mm_use_im_patch_token False \ 30 | --bf16 True \ 31 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 32 | --num_train_epochs 1 \ 33 | --per_device_train_batch_size 16 \ 34 | --per_device_eval_batch_size 4 \ 35 | --gradient_accumulation_steps 1 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 50000 \ 39 | --save_total_limit 1 \ 40 | --learning_rate 2e-5 \ 41 | --weight_decay 0. 
\ 42 | --warmup_ratio 0.03 \ 43 | --lr_scheduler_type "cosine" \ 44 | --logging_steps 1 \ 45 | --tf32 True \ 46 | --model_max_length 2048 \ 47 | --gradient_checkpointing True \ 48 | --lazy_preprocess True \ 49 | --dataloader_num_workers 4 \ 50 | --report_to wandb 51 | -------------------------------------------------------------------------------- /scripts/finetune_sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | deepspeed llava/train/train_mem.py \ 6 | --deepspeed ./scripts/zero2.json \ 7 | --model_name_or_path lmsys/vicuna-13b-v1.3 \ 8 | --version $PROMPT_VERSION \ 9 | --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \ 10 | --image_folder /Data/ScienceQA/data/scienceqa/images/train \ 11 | --vision_tower openai/clip-vit-large-patch14 \ 12 | --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \ 18 | --num_train_epochs 12 \ 19 | --per_device_train_batch_size 16 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to wandb 37 | -------------------------------------------------------------------------------- /scripts/maya/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="maya_full_ft" 9 | # CKPT="maya_toxicity_free_finetuned" 10 | SPLIT="llava_gqa_testdev_balanced" 11 | GQADIR="./playground/data/eval/gqa/data" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 15 | --model-path nahidalam/$CKPT \ 16 | --model-base CohereForAI/aya-23-8B \ 17 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 18 | --image-folder ./playground/data/eval/gqa/data/images \ 19 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --conv-mode aya & 24 | done 25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 
34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 39 | 40 | cd $GQADIR 41 | python eval/eval.py --tier testdev_balanced 42 | -------------------------------------------------------------------------------- /scripts/maya/eval/load_mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../" && pwd)" 4 | 5 | LOAD_MME_PY="$PROJECT_ROOT/llava/eval/maya/load_mme.py" 6 | 7 | 8 | cd "$PROJECT_ROOT" 9 | 10 | python "$LOAD_MME_PY" 11 | 12 | -------------------------------------------------------------------------------- /scripts/maya/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path nahidalam/maya_full_ft \ 7 | --model-base CohereForAI/aya-23-8B \ 8 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/maya_full_ft.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode aya 13 | 14 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment maya_full_ft 21 | -------------------------------------------------------------------------------- /scripts/maya/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../" && pwd)" 4 | 5 | # LOAD_MME_PY="$PROJECT_ROOT/llava/eval/maya/load_mme.py" 6 | MODEL_VQA_LOADER_PY="$PROJECT_ROOT/llava/eval/model_vqa_loader.py" 7 | CONVERT_ANSWER_TO_MME_PY="$PROJECT_ROOT/llava/eval/maya/convert_answer_to_mme.py" 8 | MME_CALCULATE_PY="$PROJECT_ROOT/llava/eval/maya/calculate_mme.py" 9 | 10 | 11 | cd "$PROJECT_ROOT" 12 | 13 | # Run this line once if you want to load the data from Hugging Face (caches to disk). 
load_mme.sh is the equivalent 14 | # python "$LOAD_MME_PY 15 | 16 | 17 | python -m llava.eval.model_vqa_loader \ 18 | --model-path "nahidalam/maya_full_ft" \ 19 | --model-base "CohereForAI/aya-23-8B" \ 20 | --question-file "./playground/data/eval/MME/llava_mme.jsonl" \ 21 | --image-folder "./playground/data/eval/MME/MME_Benchmark_release_version" \ 22 | --answers-file "./playground/data/eval/MME/answers/maya_full_ft.jsonl" \ 23 | --temperature "0" \ 24 | --conv-mode "aya" 25 | 26 | 27 | cd "$PROJECT_ROOT/playground/data/eval/MME" 28 | 29 | python "$CONVERT_ANSWER_TO_MME_PY" \ 30 | --experiment "maya_full_ft" 31 | 32 | 33 | cd "$PROJECT_ROOT/llava/eval/maya" 34 | 35 | python "$MME_CALCULATE_PY" \ 36 | --results_dir "../../../playground/data/eval/MME/eval_tool/answers/maya_full_ft" 37 | 38 | -------------------------------------------------------------------------------- /scripts/maya/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path nahidalam/maya_full_ft \ 5 | --model-base CohereForAI/aya-23-8B \ 6 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 7 | --image-folder ./playground/data/eval/mm-vet/images \ 8 | --answers-file ./playground/data/eval/mm-vet/answers/maya_full_ft.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode aya 11 | 12 | mkdir -p ./playground/data/eval/mm-vet/results_maya 13 | 14 | python scripts/convert_mmvet_for_eval.py \ 15 | --src ./playground/data/eval/mm-vet/answers/maya_full_ft.jsonl \ 16 | --dst ./playground/data/eval/mm-vet/results_maya/maya_full_ft.json 17 | 18 | -------------------------------------------------------------------------------- /scripts/maya/eval/palo_multilingual_llava_bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Evaluates Maya on the PALO Multilingual-LLaVA Bench-In-The-Wild dataset. 4 | # 5 | # Usage: 6 | # bash scripts/maya/eval/palo_multilingual_llava_bench.sh \ 7 | # [PROJECTOR_PATH] 8 | # 9 | # Note: 10 | # - MODE: Choose either 'pretrained' or 'finetuned' (without quotes). Example: finetuned 11 | # - PROJECTOR_PATH: required if MODE=pretrained 12 | # - See Readme for more details 13 | 14 | 15 | 16 | export MULTILINGUAL_LLAVA_BENCH_PATH="playground/data/eval/multilingual-llava-bench-in-the-wild" 17 | export OUTPUT_DIR="evaluation" 18 | export IMAGES="$MULTILINGUAL_LLAVA_BENCH_PATH/images" 19 | export PYTHONPATH="./:$PYTHONPATH" 20 | 21 | 22 | MODEL_BASE=$1 23 | MODEL_PATH=$2 24 | MODE=$3 25 | export OPENAI_API_KEY=$4 26 | PROJECTOR_PATH=$5 27 | 28 | 29 | evaluate_language() { 30 | local LANG=$1 31 | local QUESTIONS=$2 32 | local ANSWERS=$3 33 | local CONTEXT=$4 34 | local OUTPUT_FILE="Maya_${LANG}.jsonl" 35 | 36 | echo "******** Evaluating Maya on $LANG ********" 37 | 38 | cmd="python llava/eval/model_vqa_maya.py \ 39 | --model-base "$MODEL_BASE" \ 40 | --model-path "$MODEL_PATH" \ 41 | --mode $MODE \ 42 | --conv-mode aya \ 43 | --question-file "$QUESTIONS" \ 44 | --image-folder "$IMAGES" \ 45 | --answers-file "$OUTPUT_DIR/$OUTPUT_FILE" \ 46 | --temperature 0" 47 | 48 | # Add projector path if provided 49 | if [ ! 
-z "$PROJECTOR_PATH" ]; then 50 | cmd+=" --projector-path \"$PROJECTOR_PATH\"" 51 | fi 52 | 53 | # Execute the command 54 | eval $cmd 55 | 56 | mkdir -p "$OUTPUT_DIR/reviews" 57 | 58 | python llava/eval/eval_gpt_review_bench.py \ 59 | --question "$QUESTIONS" \ 60 | --context "$CONTEXT" \ 61 | --rule llava/eval/table/rule.json \ 62 | --answer-list \ 63 | "$ANSWERS" \ 64 | "$OUTPUT_DIR/$OUTPUT_FILE" \ 65 | --output \ 66 | "$OUTPUT_DIR/reviews/$OUTPUT_FILE" 67 | 68 | python llava/eval/summarize_gpt_review.py -f "$OUTPUT_DIR/reviews/$OUTPUT_FILE" 69 | 70 | } 71 | 72 | # Evaluate each language 73 | # 1.English 74 | evaluate_language "English" "$MULTILINGUAL_LLAVA_BENCH_PATH/english/questions.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/english/answers_gpt4.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/english/context.jsonl" 75 | 76 | # 2.Chinese 77 | evaluate_language "Chinese" "$MULTILINGUAL_LLAVA_BENCH_PATH/chinese/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/chinese/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/chinese/context.jsonl" 78 | 79 | # 3.Spanish 80 | evaluate_language "Spanish" "$MULTILINGUAL_LLAVA_BENCH_PATH/spanish/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/spanish/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/spanish/context_spanish.jsonl" 81 | 82 | # 4.French 83 | evaluate_language "French" "$MULTILINGUAL_LLAVA_BENCH_PATH/french/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/french/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/french/context.jsonl" 84 | 85 | # 5.Russian 86 | evaluate_language "Russian" "$MULTILINGUAL_LLAVA_BENCH_PATH/russian/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/russian/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/russian/context.jsonl" 87 | 88 | # 6.Arabic 89 | evaluate_language "Arabic" "$MULTILINGUAL_LLAVA_BENCH_PATH/arabic/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/arabic/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/arabic/context.jsonl" 90 | 91 | # 7.Bengali -- Not currently supported 92 | evaluate_language "Bengali" "$MULTILINGUAL_LLAVA_BENCH_PATH/bengali/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/bengali/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/bengali/context.jsonl" 93 | 94 | # 8.Hindi 95 | evaluate_language "Hindi" "$MULTILINGUAL_LLAVA_BENCH_PATH/hindi/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/hindi/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/hindi/context.jsonl" 96 | 97 | # 9.Urdu -- Not currently supported 98 | evaluate_language "Urdu" "$MULTILINGUAL_LLAVA_BENCH_PATH/urdu/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/urdu/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/urdu/context.jsonl" 99 | 100 | # 10.Japanese 101 | evaluate_language "Japanese" "$MULTILINGUAL_LLAVA_BENCH_PATH/japanese/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/japanese/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/japanese/context.jsonl" 102 | -------------------------------------------------------------------------------- /scripts/maya/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m llava.eval.model_vqa_loader \ 3 | --model-path nahidalam/maya_full_ft \ 4 | --model-base CohereForAI/aya-23-8B \ 5 | --question-file ./playground/data/eval/pope/maya_pope_test.jsonl \ 6 | --image-folder ./playground/data/eval/pope/val2014 \ 7 | --answers-file ./playground/data/eval/pope/answers/maya_full_ft.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode aya 10 | 11 | python llava/eval/eval_pope.py \ 12 | --annotation-dir ./playground/data/eval/pope/coco \ 13 | --question-file 
./playground/data/eval/pope/maya_pope_test.jsonl \ 14 | --result-file ./playground/data/eval/pope/answers/maya_full_ft.jsonl 15 | -------------------------------------------------------------------------------- /scripts/maya/eval/process_prediction.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Load the Excel file 4 | file_path = './playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712/maya_full_ft.xlsx' 5 | excel_data = pd.read_excel(file_path) 6 | 7 | # Extract the 'prediction' and 'answer' columns 8 | predictions = excel_data['prediction'] 9 | answers = excel_data['answer'] 10 | 11 | # Calculate accuracy by comparing predictions to answers 12 | accuracy = (predictions == answers).mean() 13 | 14 | # Print the accuracy 15 | print(f"Accuracy: {accuracy * 100:.2f}%") 16 | 17 | -------------------------------------------------------------------------------- /scripts/maya/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_science \ 4 | --model-path nahidalam/maya_full_ft \ 5 | --model-base CohereForAI/aya-23-8B \ 6 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 7 | --image-folder ./playground/data/eval/scienceqa/images/test \ 8 | --answers-file ./playground/data/eval/scienceqa/answers/maya_full_ft.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | python llava/eval/eval_science_qa.py \ 14 | --base-dir ./playground/data/eval/scienceqa \ 15 | --result-file ./playground/data/eval/scienceqa/answers/maya_full_ft.jsonl \ 16 | --output-file ./playground/data/eval/scienceqa/answers/maya_full_ft_output.jsonl \ 17 | --output-result ./playground/data/eval/scienceqa/answers/maya_full_ft_result.json 18 | -------------------------------------------------------------------------------- /scripts/maya/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path nahidalam/maya_full_ft \ 5 | --model-base CohereForAI/aya-23-8B \ 6 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 7 | --image-folder ./playground/data/eval/textvqa/train_images \ 8 | --answers-file ./playground/data/eval/textvqa/answers/maya_full_ft.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | python -m llava.eval.eval_textvqa \ 13 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 14 | --result-file ./playground/data/eval/textvqa/answers/maya_full_ft.jsonl 15 | -------------------------------------------------------------------------------- /scripts/maya/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_vizwiz \ 4 | --model-path nahidalam/maya_full_ft \ 5 | --model-base CohereForAI/aya-23-8B \ 6 | --question-file ./playground/data/eval/vizwiz/test.json \ 7 | --image-folder ./playground/data/eval/vizwiz/test \ 8 | --answers-file ./playground/data/eval/vizwiz/answers/maya_test.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode aya 11 | 12 | python -m llava.eval.model_vqa_loader \ 13 | --model-path nahidalam/maya_full_ft \ 14 | --model-base CohereForAI/aya-23-8B \ 15 | --question-file ./playground/data/eval/vizwiz/llava_test_short.jsonl \ 16 | --image-folder ./playground/data/eval/vizwiz/test \ 17 | 
--answers-file ./playground/data/eval/vizwiz/answers/maya.jsonl \ 18 | --temperature 0 \ 19 | --conv-mode aya 20 | 21 | 22 | python scripts/convert_vizwiz_for_submission.py \ 23 | --annotation-file ./playground/data/eval/vizwiz/test.json \ 24 | --result-file ./playground/data/eval/vizwiz/answers/maya_test.jsonl \ 25 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/maya_full_ft.json 26 | -------------------------------------------------------------------------------- /scripts/maya/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="maya_full_ft" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path nahidalam/maya_full_ft \ 14 | --model-base CohereForAI/aya-23-8B \ 15 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/vqav2/test2015 \ 17 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode aya & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 37 | 38 | -------------------------------------------------------------------------------- /scripts/maya/finetune_aya_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path CohereForAI/aya-23-8B \ 6 | --version aya \ 7 | --data_path /dev/data/annotations/palo_multilingual_dataset.json \ 8 | --image_folder /dev/data/instruction_tune_dataset/ \ 9 | --vision_tower google/siglip-base-patch16-256-multilingual \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-aya-23-8b-siglip-multi-8lang-32bz-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/maya \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 4 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. 
\ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/maya/finetune_aya_siglip2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path CohereForAI/aya-23-8B \ 6 | --version aya \ 7 | --data_path /dev/data/annotations/palo_multilingual_dataset.json \ 8 | --image_folder /dev/data/instruction_tune_dataset/ \ 9 | --vision_tower google/siglip2-base-patch16-256 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-aya-23-8b-siglip2-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/maya \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 4 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/maya/pretrain_aya_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | MODEL_VERSION=aya-23-8b-siglip-multi 5 | 6 | ########### DO NOT CHANGE ########### 7 | ########### USE THIS FOR BOTH ########### 8 | PROMPT_VERSION=plain 9 | ########### DO NOT CHANGE ########### 10 | 11 | deepspeed llava/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path CohereForAI/aya-23-8B \ 14 | --version $PROMPT_VERSION \ 15 | --data_path /dev/data/LLaVA_Pretrain \ 16 | --image_folder /dev/data/images \ 17 | --vision_tower google/siglip-base-patch16-256-multilingual \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --bf16 True \ 24 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 4 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 8 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | -------------------------------------------------------------------------------- /scripts/maya/pretrain_aya_siglip2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | MODEL_VERSION=aya-23-8b-siglip2 5 | 6 | ########### DO NOT CHANGE ########### 7 | ########### USE THIS FOR BOTH ########### 8 | PROMPT_VERSION=plain 9 | ########### DO NOT CHANGE ########### 10 | 11 | deepspeed llava/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path CohereForAI/aya-23-8B \ 14 | --version $PROMPT_VERSION \ 15 | --data_path /dev/data/LLaVA_Pretrain \ 16 | --image_folder /dev/data/images \ 17 | --vision_tower google/siglip2-base-patch16-256 \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --bf16 True \ 24 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 4 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 8 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. \ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | -------------------------------------------------------------------------------- /scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | # MODEL_VERSION=vicuna-v1-3-7b 8 | # MODEL_VERSION=llama-2-7b-chat 9 | 10 | ########### DO NOT CHANGE ########### 11 | ########### USE THIS FOR BOTH ########### 12 | PROMPT_VERSION=plain 13 | ########### DO NOT CHANGE ########### 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 18 | --version $PROMPT_VERSION \ 19 | --data_path /path/to/pretrain_data.json \ 20 | --image_folder /path/to/images \ 21 | --vision_tower openai/clip-vit-large-patch14 \ 22 | --tune_mm_mlp_adapter True \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 16 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 1 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 24000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 2e-3 \ 37 | --weight_decay 0. \ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to wandb 47 | -------------------------------------------------------------------------------- /scripts/pretrain_xformers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | # MODEL_VERSION=vicuna-v1-3-7b 6 | # MODEL_VERSION=llama-2-7b-chat 7 | 8 | ########### DO NOT CHANGE ########### 9 | ########### USE THIS FOR BOTH ########### 10 | PROMPT_VERSION=plain 11 | ########### DO NOT CHANGE ########### 12 | 13 | deepspeed llava/train/train_xformers.py \ 14 | --deepspeed ./scripts/zero2.json \ 15 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 16 | --version $PROMPT_VERSION \ 17 | --data_path /path/to/pretrain_data.json \ 18 | --image_folder /path/to/images \ 19 | --vision_tower openai/clip-vit-large-patch14 \ 20 | --tune_mm_mlp_adapter True \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --bf16 False \ 25 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 4 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 2e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 False \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | -------------------------------------------------------------------------------- /scripts/sqa_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | for IDX in {0..7}; do 5 | CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ 6 | --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ 7 | --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ 8 | --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ 9 | --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ 10 | --num-chunks $CHUNKS \ 11 | --chunk-idx $IDX \ 12 | --conv-mode llava_v1 & 13 | done 14 | -------------------------------------------------------------------------------- /scripts/sqa_eval_gather.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | output_file="test_llava-13b.jsonl" 5 | 6 | # Clear out the output file if it exists. 7 | > "$output_file" 8 | 9 | # Loop through the indices and concatenate each file. 10 | for idx in $(seq 0 $((CHUNKS-1))); do 11 | cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" 12 | done 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ 16 | --result-file ./test_llava-13b.jsonl \ 17 | --output-file ./test_llava-13b_output.json \ 18 | --output-result ./test_llava-13b_result.json 19 | -------------------------------------------------------------------------------- /scripts/upload_pypi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Step 0: Clean up 4 | rm -rf dist 5 | 6 | # Step 1: Change the package name to "llava-torch" 7 | sed -i 's/name = "llava"/name = "llava-torch"/' pyproject.toml 8 | 9 | # Step 2: Build the package 10 | python -m build 11 | 12 | # Step 3: Revert the changes in pyproject.toml to the original 13 | sed -i 's/name = "llava-torch"/name = "llava"/' pyproject.toml 14 | 15 | # Step 4: Upload to PyPI 16 | python -m twine upload dist/* 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/eval_all_languages_deprecated.sh: -------------------------------------------------------------------------------- 1 | # This script for PALO Multilingual LLaVA Bench In-the-Wild Benchmark is deprecated. 
2 | # Please use the new script at LLaVA/scripts/maya/eval/palo_multilingual_llava_bench.sh 3 | 4 | 5 | 6 | #!/bin/bash 7 | 8 | export MULTILINGUAL_LLAVA_BENCH_PATH="playground/data/eval/multilingual-llava-bench-in-the-wild" 9 | export OUTPUT_DIR="evaluation" 10 | export IMAGES="$MULTILINGUAL_LLAVA_BENCH_PATH/images" 11 | 12 | #export MODEL="/path/to/palo-v1.5-7b-665en_150K_of_arr_chi_hin_spa_ben_fr_jap_rus_ur" 13 | #export MODEL_NAME="palo-v1.5-7b-665en_150K_of_arr_chi_hin_spa_ben_fr_jap_rus_ur" 14 | #export OPENAI_API_KEY="write your open-ai key" 15 | 16 | MODEL_BASE=$1 17 | MODEL_PATH=$2 18 | PROJECTOR_PATH=$3 19 | MODEL_NAME=$4 20 | export OPENAI_API_KEY=$5 21 | 22 | export PYTHONPATH="./:$PYTHONPATH" 23 | 24 | # 1.English 25 | bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/english/questions.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/english/answers_gpt4.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/english/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_English.jsonl 26 | 27 | # 2.Chinese 28 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/chinese/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/chinese/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/chinese/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Chinese.jsonl 29 | 30 | # 3.Spanish 31 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/spanish/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/spanish/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/spanish/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Spanish.jsonl 32 | 33 | # 4.French 34 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/french/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/french/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/french/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_French.jsonl 35 | 36 | # 6.Russian 37 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/russian/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/russian/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/russian/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Russian.jsonl 38 | 39 | # 7.Arabic 40 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/arabic/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/arabic/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/arabic/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Arabic.jsonl 41 | 42 | # 8.Bengali 43 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/bengali/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/bengali/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/bengali/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Bengali.jsonl 44 | 45 | # 9.Hindi 46 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/hindi/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/hindi/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/hindi/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Hindi.jsonl 47 | 48 | # 10.Urdu 49 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" 
"$MULTILINGUAL_LLAVA_BENCH_PATH"/urdu/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/urdu/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/urdu/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Urdu.jsonl 50 | 51 | # 11.Japanese 52 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/japanese/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/japanese/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/japanese/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Japanese.jsonl 53 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_gqa_testdev_balanced" 10 | GQADIR="./playground/data/eval/gqa/data" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path liuhaotian/llava-v1.5-13b \ 15 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/gqa/data/images \ 17 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 37 | 38 | cd $GQADIR 39 | python eval/eval.py --tier testdev_balanced 40 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 6 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 7 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 12 | 13 | python llava/eval/eval_gpt_review_bench.py \ 14 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 15 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 16 | --rule llava/eval/table/rule.json \ 17 | --answer-list \ 18 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 19 | playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 20 | --output \ 21 | playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 22 | 23 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 24 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/llavabench_palo_deprecated.sh: 
-------------------------------------------------------------------------------- 1 | # This script for PALO Multilingual LLaVA Bench In-the-Wild Benchmark is deprecated. 2 | # Please use the new script at LLaVA/scripts/maya/eval/palo_multilingual_llava_bench.sh 3 | 4 | 5 | 6 | #!/bin/bash 7 | 8 | IMAGES=$1 9 | MODEL_BASE=$2 10 | MODEL_PATH=$3 11 | PROJECTOR_PATH=$4 12 | QUESTIONS=$5 13 | ANSWERS=$6 14 | CONTEXT=$7 15 | OUTPUT_DIR=$8 16 | OUTPUT_FILE=$9 17 | 18 | 19 | python llava/eval/model_vqa_maya.py \ 20 | --model-base "$MODEL_BASE" \ 21 | --model-path "$MODEL_PATH" \ 22 | --projector-path "$PROJECTOR_PATH" \ 23 | --question-file "$QUESTIONS" \ 24 | --image-folder "$IMAGES" \ 25 | --answers-file "$OUTPUT_DIR/$OUTPUT_FILE" \ 26 | --temperature 0 \ 27 | --conv-mode aya 28 | 29 | mkdir -p "$OUTPUT_DIR/reviews" 30 | 31 | python llava/eval/eval_gpt_review_bench.py \ 32 | --question "$QUESTIONS" \ 33 | --context "$CONTEXT" \ 34 | --rule llava/eval/table/rule.json \ 35 | --answer-list \ 36 | "$ANSWERS" \ 37 | "$OUTPUT_DIR/$OUTPUT_FILE" \ 38 | --output \ 39 | "$OUTPUT_DIR/reviews/$OUTPUT_FILE" 40 | 41 | python llava/eval/summarize_gpt_review.py -f "$OUTPUT_DIR/reviews/$OUTPUT_FILE" 42 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 14 | 15 | python scripts/convert_mmbench_for_submission.py \ 16 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 17 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 19 | --experiment llava-v1.5-13b 20 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_cn_20231003" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --lang cn \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p playground/data/eval/mmbench_cn/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 18 | --result-dir ./playground/data/eval/mmbench_cn/answers/$SPLIT \ 19 | --upload-dir ./playground/data/eval/mmbench_cn/answers_upload/$SPLIT \ 20 | --experiment llava-v1.5-13b 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 6 | --image-folder
./playground/data/eval/MME/MME_Benchmark_release_version \ 7 | --answers-file ./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | cd ./playground/data/eval/MME 12 | 13 | python convert_answer_to_mme.py --experiment llava-v1.5-13b 14 | 15 | cd eval_tool 16 | 17 | python calculation.py --results_dir answers/llava-v1.5-13b 18 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 6 | --image-folder ./playground/data/eval/mm-vet/images \ 7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p ./playground/data/eval/mm-vet/results 12 | 13 | python scripts/convert_mmvet_for_eval.py \ 14 | --src ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 15 | --dst ./playground/data/eval/mm-vet/results/llava-v1.5-13b.json 16 | 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 6 | --image-folder ./playground/data/eval/pope/val2014 \ 7 | --answers-file ./playground/data/eval/pope/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python llava/eval/eval_pope.py \ 12 | --annotation-dir ./playground/data/eval/pope/coco \ 13 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 14 | --result-file ./playground/data/eval/pope/answers/llava-v1.5-13b.jsonl 15 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | echo "Evaluating in 'dev' split." 5 | elif [ "$1" = "test" ]; then 6 | echo "Evaluating in 'test' split." 7 | else 8 | echo "Unknown split, please choose between 'dev' and 'test'." 9 | exit 1 10 | fi 11 | 12 | python -m llava.eval.model_vqa_qbench \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 15 | --questions-file ./playground/data/eval/qbench/llvisionqa_$1.json \ 16 | --answers-file ./playground/data/eval/qbench/llvisionqa_$1_answers.jsonl \ 17 | --conv-mode llava_v1 \ 18 | --lang en 19 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench_zh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | ZH_SPLIT="验证集" 5 | echo "Evaluating in 'dev' split." 6 | elif [ "$1" = "test" ]; then 7 | ZH_SPLIT="测试集" 8 | echo "Evaluating in 'test' split." 9 | else 10 | echo "Unknown split, please choose between 'dev' and 'test'." 
11 | exit 1 12 | fi 13 | 14 | python -m llava.eval.model_vqa_qbench \ 15 | --model-path liuhaotian/llava-v1.5-13b \ 16 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 17 | --questions-file ./playground/data/eval/qbench/质衡-问答-$ZH_SPLIT.json \ 18 | --answers-file ./playground/data/eval/qbench/llvisionqa_zh_$1_answers.jsonl \ 19 | --conv-mode llava_v1 \ 20 | --lang zh 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 12 | --model-path liuhaotian/llava-v1.5-13b \ 13 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench.jsonl \ 14 | --image-folder ./playground/data/eval/seed_bench \ 15 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode vicuna_v1 & 20 | done 21 | 22 | wait 23 | 24 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 25 | 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | # Evaluate 35 | python scripts/convert_seed_for_submission.py \ 36 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 37 | --result-file $output_file \ 38 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/llava-v1.5-13b.jsonl 39 | 40 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_science \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 6 | --image-folder ./playground/data/eval/scienceqa/images/test \ 7 | --answers-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 8 | --single-pred-prompt \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | python llava/eval/eval_science_qa.py \ 13 | --base-dir ./playground/data/eval/scienceqa \ 14 | --result-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 15 | --output-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_output.jsonl \ 16 | --output-result ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_result.json 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 6 | --image-folder ./playground/data/eval/textvqa/train_images \ 7 | --answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python -m llava.eval.eval_textvqa \ 12 | 
--annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 13 | --result-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl 14 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 6 | --image-folder ./playground/data/eval/vizwiz/test \ 7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python scripts/convert_vizwiz_for_submission.py \ 12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 13 | --result-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-v1.5-13b.json 15 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder ./playground/data/eval/vqav2/test2015 \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 
31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 36 | 37 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-13b \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 16 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 1 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_aya_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path CohereForAI/aya-23-8B \ 6 | --version aya \ 7 | --data_path /dev/data/annotations/palo_multilingual_dataset.json \ 8 | --image_folder /dev/data/instruction_tune_dataset/ \ 9 | --vision_tower google/siglip-base-patch16-256-multilingual \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-aya-23-8b-siglip-multi-8lang-32bz-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/maya \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 4 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. 
\ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_llava_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-13b \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 16 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 1 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_llava_siglip2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-13b \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 16 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 1 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. 
\ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 5 | --deepspeed ./scripts/zero3.json \ 6 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 7 | --version v1 \ 8 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 9 | --image_folder ./playground/data \ 10 | --vision_tower openai/clip-vit-large-patch14-336 \ 11 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ 12 | --mm_projector_type mlp2x_gelu \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --image_aspect_ratio pad \ 17 | --group_by_modality_length True \ 18 | --bf16 True \ 19 | --output_dir ./checkpoints/llava-v1.5-13b-lora \ 20 | --num_train_epochs 1 \ 21 | --per_device_train_batch_size 16 \ 22 | --per_device_eval_batch_size 4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --evaluation_strategy "no" \ 25 | --save_strategy "steps" \ 26 | --save_steps 50000 \ 27 | --save_total_limit 1 \ 28 | --learning_rate 2e-4 \ 29 | --weight_decay 0. \ 30 | --warmup_ratio 0.03 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --tf32 True \ 34 | --model_max_length 2048 \ 35 | --gradient_checkpointing True \ 36 | --dataloader_num_workers 4 \ 37 | --lazy_preprocess True \ 38 | --report_to wandb 39 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path liuhaotian/llava-v1.5-13b \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --mm_vision_select_layer -2 \ 12 | --mm_use_im_start_end False \ 13 | --mm_use_im_patch_token False \ 14 | --image_aspect_ratio pad \ 15 | --group_by_modality_length True \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-v1.5-13b-task \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 16 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. 
\ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to wandb 37 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_task_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 5 | --deepspeed ./scripts/zero3.json \ 6 | --model_name_or_path liuhaotian/llava-v1.5-13b \ 7 | --version v1 \ 8 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 9 | --image_folder ./playground/data \ 10 | --vision_tower openai/clip-vit-large-patch14-336 \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-13b-task-lora \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 16 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 1 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-4 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version plain \ 7 | --data_path ./playground/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./playground/data/LLaVA-Pretrain/images \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-13b-pretrain \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. 
\ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain_aya.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | MODEL_VERSION=aya-23-8b 5 | 6 | ########### DO NOT CHANGE ########### 7 | ########### USE THIS FOR BOTH ########### 8 | PROMPT_VERSION=plain 9 | ########### DO NOT CHANGE ########### 10 | 11 | deepspeed llava/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path CohereForAI/aya-23-8B \ 14 | --version $PROMPT_VERSION \ 15 | --data_path /home/ubuntu/LLaVA/playground/data/LLaVA-Pretrain/ \ 16 | --image_folder /home/ubuntu/LLaVA/playground/data/LLaVA-Pretrain/images \ 17 | --vision_tower openai/clip-vit-large-patch14-336 \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --bf16 True \ 24 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 8 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 1 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. \ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain_aya_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | MODEL_VERSION=aya-23-8b-siglip-multi 5 | 6 | ########### DO NOT CHANGE ########### 7 | ########### USE THIS FOR BOTH ########### 8 | PROMPT_VERSION=plain 9 | ########### DO NOT CHANGE ########### 10 | 11 | deepspeed llava/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path CohereForAI/aya-23-8B \ 14 | --version $PROMPT_VERSION \ 15 | --data_path /dev/data/LLaVA_Pretrain \ 16 | --image_folder /dev/data/images \ 17 | --vision_tower google/siglip-base-patch16-256-multilingual \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --bf16 True \ 24 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 4 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 8 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain_llava_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-7b-v1.5 \ 6 | --version plain \ 7 | --data_path ./dev/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./dev/data/images \ 9 | --vision_tower google/siglip-base-patch16-256 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-7b-pretrain-siglip-base-patch16-256 \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. \ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain_llava_siglip2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-7b-v1.5 \ 6 | --version plain \ 7 | --data_path ./dev/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./dev/data/images \ 9 | --vision_tower google/siglip2-base-patch16-256 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-7b-pretrain-siglip2-base-patch-16 \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. 
\ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } --------------------------------------------------------------------------------
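Note on the ZeRO configs above (scripts/zero2.json, scripts/zero3.json, scripts/zero3_offload.json): every training script in this repo hands one of them to DeepSpeed through the --deepspeed flag, and the HuggingFace Trainer's DeepSpeed integration resolves each "auto" field from the matching command-line flag (--bf16, --per_device_train_batch_size, --gradient_accumulation_steps, --learning_rate, and so on), so only settings such as the ZeRO stage and the offload targets are pinned in the JSON itself. Below is a minimal illustrative sketch, not part of the repository (the file name inspect_zero_config.py is hypothetical), that lists which settings a given config hard-codes and which are left for the Trainer to fill in:

# inspect_zero_config.py -- illustrative sketch only, not part of the repository.
# Prints which DeepSpeed settings a ZeRO config pins down and which are left
# as "auto" for the HuggingFace Trainer to fill in from its TrainingArguments.
import json
import sys

def walk(node, prefix=""):
    for key, value in node.items():
        path = prefix + key
        if isinstance(value, dict):
            walk(value, path + ".")
        elif value == "auto":
            print(f"auto  : {path}  (taken from the shell flags at launch time)")
        else:
            print(f"fixed : {path} = {value}")

if __name__ == "__main__":
    # Example: python inspect_zero_config.py scripts/zero3_offload.json
    config_path = sys.argv[1] if len(sys.argv) > 1 else "scripts/zero3.json"
    with open(config_path) as f:
        walk(json.load(f))

Run against scripts/zero3_offload.json, for example, the fixed entries are mostly the ZeRO stage, the CPU offload targets for optimizer states and parameters, and the communication/bucketing settings, while batch sizes, precision mode, learning rate, and the optimizer/scheduler parameters remain "auto" and therefore follow whatever the finetune_*.sh or pretrain_*.sh script passes on the command line.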