├── 001.jpg ├── 002.jpg ├── 011.jpg ├── 022.jpg ├── 023.jpg ├── 024.jpg ├── LICENSE ├── README.md ├── cog.yaml ├── evaluation ├── mmbench_maya_full_ft.txt ├── mmbench_maya_full_ft.xlsx ├── mme_results_dump.txt ├── mme_results_toxicity_finetuned.txt ├── palo_log_8_lang_maya_gpt4-0613.log ├── palo_log_8_lang_maya_toxicity_free_gpt4-0613.log ├── pope_results.txt ├── pope_results_toxicity_finetuned.txt ├── vizwiz_results.txt ├── vizwiz_results_toxicity_finetuned.txt └── vqav2_maya_full_ft.json ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── claude_plane_test.jpeg │ ├── claude_plane_test_2.jpeg │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── maya │ │ ├── calculate_mme.py │ │ ├── convert_answer_to_mme.py │ │ ├── eval_utils.py │ │ └── load_mme.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_maya.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_science.py │ ├── model_vqa_vizwiz.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ ├── table │ │ ├── answer │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ ├── answer_bard.jsonl │ │ │ ├── answer_gpt35.jsonl │ │ │ ├── answer_llama-13b.jsonl │ │ │ └── answer_vicuna-13b.jsonl │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ ├── model.jsonl │ │ ├── prompt.jsonl │ │ ├── question.jsonl │ │ ├── results │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ ├── review │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ ├── reviewer.jsonl │ │ └── rule.json │ ├── talk2maya.py │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ └── llava_cohere.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ └── siglip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── playground ├── __init__.py ├── data │ ├── coco2014_val_gpt4_qa_30x3.jsonl │ ├── coco2014_val_qa_eval │ │ ├── qa90_gpt4_answer.jsonl │ │ └── qa90_questions.jsonl │ └── prompts │ │ ├── complex_reasoning │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ ├── 002_caps.txt │ │ ├── 002_conv.txt │ │ └── system_message.txt │ │ ├── conversation │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ └── system_message.txt │ │ └── detail_description │ │ ├── 000_caps.txt │ │ ├── 000_conv.txt │ │ ├── 001_caps.txt │ │ ├── 001_conv.txt │ │ ├── 002_caps.txt │ │ ├── 002_conv.txt │ │ └── system_message.txt ├── eval │ └── eval_utils.py └── finetuning │ ├── dataset_utils.py │ └── finetune_args.py ├── predict.py ├── pyproject.toml ├── requirements.sh └── scripts ├── convert_gqa_for_eval.py ├── 
convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_seed_for_submission.py ├── convert_sqa_to_llava.py ├── convert_sqa_to_llava_base_prompt.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── extract_mm_projector.py ├── finetune.sh ├── finetune_full_schedule.sh ├── finetune_lora.sh ├── finetune_qlora.sh ├── finetune_sqa.sh ├── maya ├── eval │ ├── gqa.sh │ ├── load_mme.sh │ ├── mm-vet_evaluator.ipynb │ ├── mmbench.sh │ ├── mme.ipynb │ ├── mme.sh │ ├── mmvet.sh │ ├── mmvet_score_calc.ipynb │ ├── palo_multilingual_llava_bench.sh │ ├── pope.sh │ ├── process_prediction.py │ ├── sqa.sh │ ├── textvqa.sh │ ├── vizwiz.sh │ └── vqav2.sh ├── finetune_aya_siglip.sh ├── finetune_aya_siglip2.sh ├── pretrain_aya_siglip.sh └── pretrain_aya_siglip2.sh ├── merge_lora_weights.py ├── pretrain.sh ├── pretrain_xformers.sh ├── sqa_eval_batch.sh ├── sqa_eval_gather.sh ├── upload_pypi.sh ├── v1_5 ├── eval │ ├── eval_all_languages_deprecated.sh │ ├── gqa.sh │ ├── llavabench.sh │ ├── llavabench_palo_deprecated.sh │ ├── mmbench.sh │ ├── mmbench_cn.sh │ ├── mme.sh │ ├── mmvet.sh │ ├── pope.sh │ ├── qbench.sh │ ├── qbench_zh.sh │ ├── seed.sh │ ├── sqa.sh │ ├── textvqa.sh │ ├── vizwiz.sh │ └── vqav2.sh ├── finetune.sh ├── finetune_aya_siglip.sh ├── finetune_llava_siglip.sh ├── finetune_llava_siglip2.sh ├── finetune_lora.sh ├── finetune_task.sh ├── finetune_task_lora.sh ├── pretrain.sh ├── pretrain_aya.sh ├── pretrain_aya_siglip.sh ├── pretrain_llava_siglip.sh └── pretrain_llava_siglip2.sh ├── zero2.json ├── zero3.json └── zero3_offload.json /001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/001.jpg -------------------------------------------------------------------------------- /002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/002.jpg -------------------------------------------------------------------------------- /011.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/011.jpg -------------------------------------------------------------------------------- /022.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/022.jpg -------------------------------------------------------------------------------- /023.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/023.jpg -------------------------------------------------------------------------------- /024.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/024.jpg -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - 
"bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /evaluation/mmbench_maya_full_ft.txt: -------------------------------------------------------------------------------- 1 | Accuracy: 71.10% 2 | Precision: 71.23% 3 | Recall: 71.10% 4 | F1 Score: 71.10% 5 | -------------------------------------------------------------------------------- /evaluation/mmbench_maya_full_ft.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/evaluation/mmbench_maya_full_ft.xlsx -------------------------------------------------------------------------------- /evaluation/mme_results_dump.txt: -------------------------------------------------------------------------------- 1 | WARNING: 1 ground truths were missing during eval and defaulted to 'Yes'. This will lead to inaccurate eval. 2 | -------------Summary of Missing Prompts------------- 3 | Category: code_reasoning, File: 0020.txt, Prompt: Is a c++ code shown in the picture? Please answer yes or no. 
4 | 5 | 6 | 7 | ================= Perception ================= 8 | #-------existence-------# 9 | TP: 28 10 | FN: 2 11 | TN: 28 12 | FP: 2 13 | precision: 0.9333333333333333 14 | recall: 0.9333333333333333 15 | f1: 0.9333333333333333 16 | yes_ratio: 0.5 17 | other_num: 0 18 | acc: 0.9333333333333333 19 | acc_plus: 0.8666666666666667 20 | #-------count-------# 21 | TP: 28 22 | FN: 2 23 | TN: 15 24 | FP: 15 25 | precision: 0.6511627906976745 26 | recall: 0.9333333333333333 27 | f1: 0.767123287671233 28 | yes_ratio: 0.5 29 | other_num: 0 30 | acc: 0.7166666666666667 31 | acc_plus: 0.4666666666666667 32 | #-------position-------# 33 | TP: 27 34 | FN: 3 35 | TN: 18 36 | FP: 12 37 | precision: 0.6923076923076923 38 | recall: 0.9 39 | f1: 0.7826086956521738 40 | yes_ratio: 0.5 41 | other_num: 0 42 | acc: 0.75 43 | acc_plus: 0.5666666666666667 44 | #-------color-------# 45 | TP: 28 46 | FN: 2 47 | TN: 21 48 | FP: 9 49 | precision: 0.7567567567567568 50 | recall: 0.9333333333333333 51 | f1: 0.835820895522388 52 | yes_ratio: 0.5 53 | other_num: 0 54 | acc: 0.8166666666666667 55 | acc_plus: 0.6333333333333333 56 | #-------posters-------# 57 | TP: 90 58 | FN: 48 59 | TN: 128 60 | FP: 19 61 | precision: 0.8256880733944955 62 | recall: 0.6521739130434783 63 | f1: 0.728744939271255 64 | yes_ratio: 0.4842105263157895 65 | other_num: 9 66 | acc: 0.7414965986394558 67 | acc_plus: 0.5306122448979592 68 | #-------celebrity-------# 69 | TP: 150 70 | FN: 20 71 | TN: 72 72 | FP: 98 73 | precision: 0.6048387096774194 74 | recall: 0.8823529411764706 75 | f1: 0.7177033492822966 76 | yes_ratio: 0.5 77 | other_num: 0 78 | acc: 0.6529411764705882 79 | acc_plus: 0.3588235294117647 80 | #-------scene-------# 81 | TP: 161 82 | FN: 38 83 | TN: 179 84 | FP: 21 85 | precision: 0.8846153846153846 86 | recall: 0.8090452261306532 87 | f1: 0.8451443569553806 88 | yes_ratio: 0.49874686716791977 89 | other_num: 1 90 | acc: 0.85 91 | acc_plus: 0.725 92 | #-------landmark-------# 93 | TP: 126 94 | FN: 74 95 | TN: 176 96 | FP: 24 97 | precision: 0.84 98 | recall: 0.63 99 | f1: 0.72 100 | yes_ratio: 0.5 101 | other_num: 0 102 | acc: 0.755 103 | acc_plus: 0.535 104 | #-------artwork-------# 105 | TP: 157 106 | FN: 36 107 | TN: 95 108 | FP: 89 109 | precision: 0.6382113821138211 110 | recall: 0.8134715025906736 111 | f1: 0.7152619589977222 112 | yes_ratio: 0.5119363395225465 113 | other_num: 23 114 | acc: 0.63 115 | acc_plus: 0.35 116 | #-------OCR-------# 117 | TP: 19 118 | FN: 1 119 | TN: 4 120 | FP: 16 121 | precision: 0.5428571428571428 122 | recall: 0.95 123 | f1: 0.6909090909090908 124 | yes_ratio: 0.5 125 | other_num: 0 126 | acc: 0.575 127 | acc_plus: 0.2 128 | 129 | 130 | Total Score: 1265.3873549419768 131 | 132 | existence score: 180.0 133 | count score: 118.33333333333334 134 | position score: 131.66666666666666 135 | color score: 145.0 136 | posters score: 127.2108843537415 137 | celebrity score: 101.1764705882353 138 | scene score: 157.5 139 | landmark score: 129.0 140 | artwork score: 98.0 141 | OCR score: 77.5 142 | 143 | 144 | ================= Cognition ================= 145 | #-------commonsense_reasoning-------# 146 | TP: 53 147 | FN: 17 148 | TN: 53 149 | FP: 17 150 | precision: 0.7571428571428571 151 | recall: 0.7571428571428571 152 | f1: 0.7571428571428571 153 | yes_ratio: 0.5 154 | other_num: 0 155 | acc: 0.7571428571428571 156 | acc_plus: 0.5571428571428572 157 | #-------numerical_calculation-------# 158 | TP: 15 159 | FN: 5 160 | TN: 4 161 | FP: 16 162 | precision: 0.4838709677419355 163 | recall: 0.75 164 | 
f1: 0.5882352941176471 165 | yes_ratio: 0.5 166 | other_num: 0 167 | acc: 0.475 168 | acc_plus: 0.1 169 | #-------text_translation-------# 170 | TP: 3 171 | FN: 17 172 | TN: 18 173 | FP: 2 174 | precision: 0.6 175 | recall: 0.15 176 | f1: 0.24 177 | yes_ratio: 0.5 178 | other_num: 0 179 | acc: 0.525 180 | acc_plus: 0.1 181 | #-------code_reasoning-------# 182 | TP: 13 183 | FN: 7 184 | TN: 11 185 | FP: 9 186 | precision: 0.5909090909090909 187 | recall: 0.65 188 | f1: 0.6190476190476191 189 | yes_ratio: 0.5 190 | other_num: 0 191 | acc: 0.6 192 | acc_plus: 0.25 193 | 194 | 195 | Total Score: 336.42857142857144 196 | 197 | commonsense_reasoning score: 131.42857142857142 198 | numerical_calculation score: 57.5 199 | text_translation score: 62.5 200 | code_reasoning score: 85.0 201 | 202 | 203 | ==== Overall Metrics Across All Responses ==== 204 | TP: 898 205 | FN: 272 206 | TN: 822 207 | FP: 349 208 | precision: 0.7201283079390537 209 | recall: 0.7675213675213676 210 | f1: 0.7430699213901532 211 | yes_ratio: 0.4997864160615122 212 | other_num: 33 213 | acc: 0.7245155855096883 -------------------------------------------------------------------------------- /evaluation/mme_results_toxicity_finetuned.txt: -------------------------------------------------------------------------------- 1 | WARNING: 1 ground truths were missing during eval and defaulted to 'Yes'. This will lead to inaccurate eval. 2 | -------------Summary of Missing Prompts------------- 3 | Category: code_reasoning, File: 0020.txt, Prompt: Is a c++ code shown in the picture? Please answer yes or no. 4 | 5 | 6 | 7 | ================= Perception ================= 8 | #-------existence-------# 9 | TP: 29 10 | FN: 1 11 | TN: 30 12 | FP: 0 13 | precision: 1.0 14 | recall: 0.9666666666666667 15 | f1: 0.983050847457627 16 | yes_ratio: 0.5 17 | other_num: 0 18 | acc: 0.9833333333333333 19 | acc_plus: 0.9666666666666667 20 | #-------count-------# 21 | TP: 26 22 | FN: 4 23 | TN: 20 24 | FP: 10 25 | precision: 0.7222222222222222 26 | recall: 0.8666666666666667 27 | f1: 0.7878787878787877 28 | yes_ratio: 0.5 29 | other_num: 0 30 | acc: 0.7666666666666667 31 | acc_plus: 0.5666666666666667 32 | #-------position-------# 33 | TP: 27 34 | FN: 3 35 | TN: 11 36 | FP: 19 37 | precision: 0.5869565217391305 38 | recall: 0.9 39 | f1: 0.7105263157894737 40 | yes_ratio: 0.5 41 | other_num: 0 42 | acc: 0.6333333333333333 43 | acc_plus: 0.36666666666666664 44 | #-------color-------# 45 | TP: 30 46 | FN: 0 47 | TN: 22 48 | FP: 8 49 | precision: 0.7894736842105263 50 | recall: 1.0 51 | f1: 0.8823529411764706 52 | yes_ratio: 0.5 53 | other_num: 0 54 | acc: 0.8666666666666667 55 | acc_plus: 0.7333333333333333 56 | #-------posters-------# 57 | TP: 88 58 | FN: 58 59 | TN: 132 60 | FP: 14 61 | precision: 0.8627450980392157 62 | recall: 0.6027397260273972 63 | f1: 0.7096774193548387 64 | yes_ratio: 0.5 65 | other_num: 2 66 | acc: 0.7482993197278912 67 | acc_plus: 0.54421768707483 68 | #-------celebrity-------# 69 | TP: 124 70 | FN: 46 71 | TN: 111 72 | FP: 59 73 | precision: 0.6775956284153005 74 | recall: 0.7294117647058823 75 | f1: 0.7025495750708215 76 | yes_ratio: 0.5 77 | other_num: 0 78 | acc: 0.6911764705882353 79 | acc_plus: 0.4588235294117647 80 | #-------scene-------# 81 | TP: 164 82 | FN: 36 83 | TN: 183 84 | FP: 17 85 | precision: 0.9060773480662984 86 | recall: 0.82 87 | f1: 0.8608923884514436 88 | yes_ratio: 0.5 89 | other_num: 0 90 | acc: 0.8675 91 | acc_plus: 0.755 92 | #-------landmark-------# 93 | TP: 61 94 | FN: 139 95 | TN: 199 96 | FP: 1 97 | 
precision: 0.9838709677419355 98 | recall: 0.305 99 | f1: 0.46564885496183206 100 | yes_ratio: 0.5 101 | other_num: 0 102 | acc: 0.65 103 | acc_plus: 0.3 104 | #-------artwork-------# 105 | TP: 141 106 | FN: 51 107 | TN: 121 108 | FP: 65 109 | precision: 0.6844660194174758 110 | recall: 0.734375 111 | f1: 0.7085427135678393 112 | yes_ratio: 0.5079365079365079 113 | other_num: 22 114 | acc: 0.655 115 | acc_plus: 0.4 116 | #-------OCR-------# 117 | TP: 19 118 | FN: 1 119 | TN: 8 120 | FP: 12 121 | precision: 0.6129032258064516 122 | recall: 0.95 123 | f1: 0.7450980392156863 124 | yes_ratio: 0.5 125 | other_num: 0 126 | acc: 0.675 127 | acc_plus: 0.4 128 | 129 | 130 | Total Score: 1302.8350340136055 131 | 132 | existence score: 195.0 133 | count score: 133.33333333333334 134 | position score: 100.0 135 | color score: 160.0 136 | posters score: 129.2517006802721 137 | celebrity score: 115.0 138 | scene score: 162.25 139 | landmark score: 95.0 140 | artwork score: 105.5 141 | OCR score: 107.5 142 | 143 | 144 | ================= Cognition ================= 145 | #-------commonsense_reasoning-------# 146 | TP: 54 147 | FN: 16 148 | TN: 42 149 | FP: 28 150 | precision: 0.6585365853658537 151 | recall: 0.7714285714285715 152 | f1: 0.7105263157894737 153 | yes_ratio: 0.5 154 | other_num: 0 155 | acc: 0.6857142857142857 156 | acc_plus: 0.44285714285714284 157 | #-------numerical_calculation-------# 158 | TP: 15 159 | FN: 5 160 | TN: 5 161 | FP: 15 162 | precision: 0.5 163 | recall: 0.75 164 | f1: 0.6 165 | yes_ratio: 0.5 166 | other_num: 0 167 | acc: 0.5 168 | acc_plus: 0.15 169 | #-------text_translation-------# 170 | TP: 5 171 | FN: 15 172 | TN: 17 173 | FP: 3 174 | precision: 0.625 175 | recall: 0.25 176 | f1: 0.35714285714285715 177 | yes_ratio: 0.5 178 | other_num: 0 179 | acc: 0.55 180 | acc_plus: 0.2 181 | #-------code_reasoning-------# 182 | TP: 13 183 | FN: 7 184 | TN: 10 185 | FP: 10 186 | precision: 0.5652173913043478 187 | recall: 0.65 188 | f1: 0.6046511627906976 189 | yes_ratio: 0.5 190 | other_num: 0 191 | acc: 0.575 192 | acc_plus: 0.2 193 | 194 | 195 | Total Score: 330.3571428571429 196 | 197 | commonsense_reasoning score: 112.85714285714286 198 | numerical_calculation score: 65.0 199 | text_translation score: 75.0 200 | code_reasoning score: 77.5 201 | 202 | 203 | ==== Overall Metrics Across All Responses ==== 204 | TP: 796 205 | FN: 382 206 | TN: 911 207 | FP: 261 208 | precision: 0.7530747398297067 209 | recall: 0.6757215619694398 210 | f1: 0.712304250559284 211 | yes_ratio: 0.5012765957446809 212 | other_num: 24 213 | acc: 0.719039595619208 -------------------------------------------------------------------------------- /evaluation/pope_results.txt: -------------------------------------------------------------------------------- 1 | Category: adversarial, # samples: 3000 2 | TP FP TN FN 3 | 1149 220 1280 351 4 | Accuracy: 0.8096666666666666 5 | Precision: 0.8392987582176772 6 | Recall: 0.766 7 | F1 score: 0.8009759498082956 8 | Yes ratio: 0.4563333333333333 9 | 0.801, 0.810, 0.839, 0.766, 0.456 10 | ==================================== 11 | Category: popular, # samples: 3000 12 | TP FP TN FN 13 | 1149 125 1375 351 14 | Accuracy: 0.8413333333333334 15 | Precision: 0.901883830455259 16 | Recall: 0.766 17 | F1 score: 0.8284066330209084 18 | Yes ratio: 0.4246666666666667 19 | 0.828, 0.841, 0.902, 0.766, 0.425 20 | ==================================== 21 | Category: random, # samples: 2910 22 | TP FP TN FN 23 | 1149 78 1332 351 24 | Accuracy: 0.8525773195876288 25 | Precision: 
0.9364303178484108 26 | Recall: 0.766 27 | F1 score: 0.8426842684268426 28 | Yes ratio: 0.4216494845360825 29 | 0.843, 0.853, 0.936, 0.766, 0.422 30 | ==================================== -------------------------------------------------------------------------------- /evaluation/pope_results_toxicity_finetuned.txt: -------------------------------------------------------------------------------- 1 | Category: adversarial, # samples: 3000 2 | TP FP TN FN 3 | 1177 225 1275 323 4 | Accuracy: 0.8173333333333334 5 | Precision: 0.8395149786019972 6 | Recall: 0.7846666666666666 7 | F1 score: 0.8111647139903514 8 | Yes ratio: 0.4673333333333333 9 | 0.811, 0.817, 0.840, 0.785, 0.467 10 | ==================================== 11 | Category: popular, # samples: 3000 12 | TP FP TN FN 13 | 1177 114 1386 323 14 | Accuracy: 0.8543333333333333 15 | Precision: 0.9116963594113091 16 | Recall: 0.7846666666666666 17 | F1 score: 0.8434252955929774 18 | Yes ratio: 0.43033333333333335 19 | 0.843, 0.854, 0.912, 0.785, 0.430 20 | ==================================== 21 | Category: random, # samples: 2910 22 | TP FP TN FN 23 | 1177 79 1331 323 24 | Accuracy: 0.8618556701030928 25 | Precision: 0.9371019108280255 26 | Recall: 0.7846666666666666 27 | F1 score: 0.8541364296081279 28 | Yes ratio: 0.4316151202749141 29 | 0.854, 0.862, 0.937, 0.785, 0.432 30 | ==================================== -------------------------------------------------------------------------------- /evaluation/vizwiz_results.txt: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test": { 4 | "overall": 34.92, 5 | "other": 34.03, 6 | "unanswerable": 30.88, 7 | "yes/no": 77.02, 8 | "number": 24.63 9 | } 10 | } 11 | ] -------------------------------------------------------------------------------- /evaluation/vizwiz_results_toxicity_finetuned.txt: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test": { 4 | "overall": 34.98, 5 | "other": 33.66, 6 | "unanswerable": 32.03, 7 | "yes/no": 77.08, 8 | "number": 24.72 9 | } 10 | } 11 | ] -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig 3 | except: 4 | pass 5 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /llava/eval/claude_plane_test.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/claude_plane_test.jpeg -------------------------------------------------------------------------------- /llava/eval/claude_plane_test_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/claude_plane_test_2.jpeg -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4o-mini', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | print('rate limit error!') 31 | pass 32 | except Exception as e: 33 | print(e) 34 | time.sleep(NUM_SECONDS_TO_SLEEP) 35 | 36 | print('success!') 37 | return response['choices'][0]['message']['content'] 38 | 39 | 40 | def parse_score(review): 41 | try: 42 | score_pair = review.split('\n')[0] 43 | score_pair = score_pair.replace(',', ' ') 44 | sp = score_pair.split(' ') 45 | if len(sp) == 2: 46 | return [float(sp[0]), float(sp[1])] 47 | else: 48 | print('error', review) 49 | return [-1, -1] 50 | except Exception as e: 51 | print(e) 52 | print('error', review) 53 | return [-1, -1] 54 | 55 | 56 | if __name__ == '__main__': 57 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 58 | parser.add_argument('-q', '--question') 59 | # parser.add_argument('-a', '--answer') 60 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 61 | parser.add_argument('-r', '--rule') 62 | parser.add_argument('-o', '--output') 63 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 64 | args = parser.parse_args() 65 | 66 | ray.init() 67 | 68 | f_q = open(os.path.expanduser(args.question)) 69 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 70 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 71 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 72 | 73 | review_file = open(f'{args.output}', 'w') 74 | 75 | js_list = [] 76 | handles = [] 77 | idx = 0 78 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 79 | # if idx == 1: 80 | # break 81 | 82 | ques = json.loads(ques_js) 83 | ans1 = json.loads(ans1_js) 84 | ans2 = json.loads(ans2_js) 85 | 86 | category = json.loads(ques_js)['category'] 87 | if category in 
rule_dict: 88 | rule = rule_dict[category] 89 | else: 90 | rule = rule_dict['default'] 91 | prompt = rule['prompt'] 92 | role = rule['role'] 93 | content = (f'[Question]\n{ques["text"]}\n\n' 94 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 95 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 96 | f'[System]\n{prompt}\n\n') 97 | js_list.append({ 98 | 'id': idx+1, 99 | 'question_id': ques['question_id'], 100 | 'answer1_id': ans1['answer_id'], 101 | 'answer2_id': ans2['answer_id'], 102 | 'category': category}) 103 | idx += 1 104 | handles.append(get_eval.remote(content, args.max_tokens)) 105 | # To avoid the rate limit set by OpenAI 106 | time.sleep(NUM_SECONDS_TO_SLEEP) 107 | 108 | reviews = ray.get(handles) 109 | for idx, review in enumerate(reviews): 110 | scores = parse_score(review) 111 | js_list[idx]['content'] = review 112 | js_list[idx]['tuple'] = scores 113 | review_file.write(json.dumps(js_list[idx]) + '\n') 114 | review_file.close() 115 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | #model='gpt-4o-mini', #gpt-4-0314 16 | model='gpt-4-0613', 17 | messages=[{ 18 | 'role': 'system', 19 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 20 | }, { 21 | 'role': 'user', 22 | 'content': content, 23 | }], 24 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 25 | max_tokens=max_tokens, 26 | ) 27 | break 28 | except openai.error.RateLimitError: 29 | print('rate limit error!') 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | return response['choices'][0]['message']['content'] 36 | 37 | 38 | def parse_score(review): 39 | try: 40 | score_pair = review.split('\n')[0] 41 | score_pair = score_pair.replace(',', ' ') 42 | sp = score_pair.split(' ') 43 | if len(sp) == 2: 44 | return [float(sp[0]), float(sp[1])] 45 | else: 46 | print('error', review) 47 | print(f"Warning: Unexpected score format. Original format: {sp}") 48 | corrected_sp = sp[:2] 49 | try: 50 | print(f"Attempting to use: {corrected_sp}") 51 | return [float(corrected_sp[0]), float(corrected_sp[1])] 52 | except (ValueError, TypeError): 53 | print(f"Unable to parse scores. 
Returning default [-1, -1]") 54 | return [-1, -1] 55 | except Exception as e: 56 | print(e) 57 | print('error', review) 58 | return [-1, -1] 59 | 60 | 61 | if __name__ == '__main__': 62 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 63 | parser.add_argument('-q', '--question') 64 | parser.add_argument('-c', '--context') 65 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 66 | parser.add_argument('-r', '--rule') 67 | parser.add_argument('-o', '--output') 68 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 69 | args = parser.parse_args() 70 | 71 | f_q = open(os.path.expanduser(args.question)) 72 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 73 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 74 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 75 | 76 | if os.path.isfile(os.path.expanduser(args.output)): 77 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 78 | else: 79 | cur_reviews = [] 80 | 81 | review_file = open(f'{args.output}', 'a') 82 | 83 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 84 | image_to_context = {context['image']: context for context in context_list} 85 | 86 | handles = [] 87 | idx = 0 88 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 89 | ques = json.loads(ques_js) 90 | ans1 = json.loads(ans1_js) 91 | ans2 = json.loads(ans2_js) 92 | 93 | inst = image_to_context[ques['image']] 94 | 95 | if isinstance(inst['caption'], list): 96 | cap_str = '\n'.join(inst['caption']) 97 | else: 98 | cap_str = inst['caption'] 99 | 100 | category = 'llava_bench_' + json.loads(ques_js)['category'] 101 | if category in rule_dict: 102 | rule = rule_dict[category] 103 | else: 104 | assert False, f"Visual QA category not found in rule file: {category}." 105 | prompt = rule['prompt'] 106 | role = rule['role'] 107 | content = (f'[Context]\n{cap_str}\n\n' 108 | f'[Question]\n{ques["text"]}\n\n' 109 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 110 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 111 | f'[System]\n{prompt}\n\n') 112 | cur_js = { 113 | 'id': idx+1, 114 | 'question_id': ques['question_id'], 115 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 116 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 117 | 'category': category 118 | } 119 | if idx >= len(cur_reviews): 120 | review = get_eval(content, args.max_tokens) 121 | scores = parse_score(review) 122 | cur_js['content'] = review 123 | cur_js['tuple'] = scores 124 | review_file.write(json.dumps(cur_js) + '\n') 125 | review_file.flush() 126 | else: 127 | print(f'Skipping {idx} as we already have it.') 128 | idx += 1 129 | print(idx) 130 | review_file.close() 131 | -------------------------------------------------------------------------------- /llava/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4o-mini', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | print('rate limit error!') 29 | pass 30 | except Exception as e: 31 | print(e) 32 | time.sleep(NUM_SECONDS_TO_SLEEP) 33 | 34 | return response['choices'][0]['message']['content'] 35 | 36 | 37 | def parse_score(review): 38 | try: 39 | score_pair = review.split('\n')[0] 40 | score_pair = score_pair.replace(',', ' ') 41 | sp = score_pair.split(' ') 42 | if len(sp) == 2: 43 | return [float(sp[0]), float(sp[1])] 44 | else: 45 | print('error', review) 46 | return [-1, -1] 47 | except Exception as e: 48 | print(e) 49 | print('error', review) 50 | return [-1, -1] 51 | 52 | 53 | if __name__ == '__main__': 54 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 55 | parser.add_argument('-q', '--question') 56 | parser.add_argument('-c', '--context') 57 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 58 | parser.add_argument('-r', '--rule') 59 | parser.add_argument('-o', '--output') 60 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 61 | args = parser.parse_args() 62 | 63 | f_q = open(os.path.expanduser(args.question)) 64 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 65 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 66 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 67 | 68 | if os.path.isfile(os.path.expanduser(args.output)): 69 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 70 | else: 71 | cur_reviews = [] 72 | 73 | review_file = open(f'{args.output}', 'a') 74 | 75 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 76 | image_to_context = {context['image']: context for context in context_list} 77 | 78 | handles = [] 79 | idx = 0 80 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | inst = image_to_context[ques['image']] 86 | cap_str = '\n'.join(inst['captions']) 87 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 88 | 89 | category = json.loads(ques_js)['category'] 90 | if category in rule_dict: 91 | rule = rule_dict[category] 92 | else: 93 | assert False, f"Visual QA category not found in rule file: {category}." 
94 | prompt = rule['prompt'] 95 | role = rule['role'] 96 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 97 | f'[Question]\n{ques["text"]}\n\n' 98 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 99 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 100 | f'[System]\n{prompt}\n\n') 101 | cur_js = { 102 | 'id': idx+1, 103 | 'question_id': ques['question_id'], 104 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 105 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 106 | 'category': category 107 | } 108 | if idx >= len(cur_reviews): 109 | review = get_eval(content, args.max_tokens) 110 | scores = parse_score(review) 111 | cur_js['content'] = review 112 | cur_js['tuple'] = scores 113 | review_file.write(json.dumps(cur_js) + '\n') 114 | review_file.flush() 115 | else: 116 | print(f'Skipping {idx} as we already have it.') 117 | idx += 1 118 | print(idx) 119 | review_file.close() 120 | -------------------------------------------------------------------------------- /llava/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if 
questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /llava/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '<image>' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /llava/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /llava/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /llava/eval/generate_webpage_data_from_table.py: -------------------------------------------------------------------------------- 1 | """Generate json file for webpage.""" 2 | import json 3 | import os 4 | import re 5 | 6 | # models = ['llama', 'alpaca', 'gpt35', 'bard'] 7 | models = ['vicuna'] 8 | 9 | 10 | def read_jsonl(path: str, key: str=None): 11 | data = [] 12 | with open(os.path.expanduser(path)) as f: 13 | for line in f: 14 | if not line: 15 | continue 16 | data.append(json.loads(line)) 17 | if key is not None: 18 | data.sort(key=lambda x: x[key]) 19 | data = {item[key]: item for item in data} 20 | return data 21 | 22 | 23 | def trim_hanging_lines(s: str, n: int) -> str: 24 | s = s.strip() 25 | for _ in range(n): 26 | s = s.split('\n', 1)[1].strip() 27 | return s 28 | 29 | 30 | if __name__ == '__main__': 31 | questions = read_jsonl('table/question.jsonl', key='question_id') 32 | 33 | # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') 34 | # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') 35 | # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') 36 | # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') 37 | vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') 38 | ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') 39 | 40 | review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') 41 | # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') 42 | # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', 
key='question_id') 43 | # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') 44 | # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') 45 | 46 | records = [] 47 | for qid in questions.keys(): 48 | r = { 49 | 'id': qid, 50 | 'category': questions[qid]['category'], 51 | 'question': questions[qid]['text'], 52 | 'answers': { 53 | # 'alpaca': alpaca_answers[qid]['text'], 54 | # 'llama': llama_answers[qid]['text'], 55 | # 'bard': bard_answers[qid]['text'], 56 | # 'gpt35': gpt35_answers[qid]['text'], 57 | 'vicuna': vicuna_answers[qid]['text'], 58 | 'ours': ours_answers[qid]['text'], 59 | }, 60 | 'evaluations': { 61 | # 'alpaca': review_alpaca[qid]['text'], 62 | # 'llama': review_llama[qid]['text'], 63 | # 'bard': review_bard[qid]['text'], 64 | 'vicuna': review_vicuna[qid]['content'], 65 | # 'gpt35': review_gpt35[qid]['text'], 66 | }, 67 | 'scores': { 68 | 'vicuna': review_vicuna[qid]['tuple'], 69 | # 'alpaca': review_alpaca[qid]['score'], 70 | # 'llama': review_llama[qid]['score'], 71 | # 'bard': review_bard[qid]['score'], 72 | # 'gpt35': review_gpt35[qid]['score'], 73 | }, 74 | } 75 | 76 | # cleanup data 77 | cleaned_evals = {} 78 | for k, v in r['evaluations'].items(): 79 | v = v.strip() 80 | lines = v.split('\n') 81 | # trim the first line if it's a pair of numbers 82 | if re.match(r'\d+[, ]+\d+', lines[0]): 83 | lines = lines[1:] 84 | v = '\n'.join(lines) 85 | cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') 86 | 87 | r['evaluations'] = cleaned_evals 88 | records.append(r) 89 | 90 | # Reorder the records, this is optional 91 | for r in records: 92 | if r['id'] <= 20: 93 | r['id'] += 60 94 | else: 95 | r['id'] -= 20 96 | for r in records: 97 | if r['id'] <= 50: 98 | r['id'] += 10 99 | elif 50 < r['id'] <= 60: 100 | r['id'] -= 50 101 | for r in records: 102 | if r['id'] == 7: 103 | r['id'] = 1 104 | elif r['id'] < 7: 105 | r['id'] += 1 106 | 107 | records.sort(key=lambda x: x['id']) 108 | 109 | # Write to file 110 | with open('webpage/data.json', 'w') as f: 111 | json.dump({'questions': records, 'models': models}, f, indent=2) 112 | -------------------------------------------------------------------------------- /llava/eval/maya/convert_answer_to_mme.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | from collections import defaultdict 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | 9 | parser.add_argument('--experiment', 10 | type=str, 11 | required=True) 12 | 13 | args = parser.parse_args() 14 | return args 15 | 16 | def get_gt(data_path): 17 | GT = {} 18 | 19 | for category in os.listdir(data_path): 20 | category_dir = os.path.join(data_path, category) 21 | if not os.path.isdir(category_dir): 22 | continue 23 | if os.path.exists(os.path.join(category_dir, 'images')): 24 | image_path = os.path.join(category_dir, 'images') 25 | else: 26 | image_path = category_dir 27 | qa_path = os.path.join(category_dir, 'questions_answers_YN') 28 | assert os.path.isdir(image_path), image_path 29 | assert os.path.isdir(qa_path), qa_path 30 | for file in os.listdir(qa_path): 31 | if not file.endswith('.txt'): 32 | continue 33 | for line in open(os.path.join(qa_path, file)): 34 | question, answer = line.strip().split('\t') 35 | GT[(category, file, question)] = answer 36 | return GT 37 | 38 | if __name__ == "__main__": 39 | 40 | args = get_args() 41 | 42 | GT = get_gt( 
43 | data_path='MME_Benchmark_release_version' 44 | ) 45 | 46 | experiment = args.experiment 47 | 48 | result_dir = os.path.join('eval_tool', 'answers', experiment) 49 | os.makedirs(result_dir, exist_ok=True) 50 | 51 | answers = [json.loads(line) for line in open(os.path.join('answers', f'{experiment}.jsonl'))] 52 | 53 | results = defaultdict(list) 54 | for answer in answers: 55 | category = answer['question_id'].split('/')[0] 56 | file = answer['question_id'].split('/')[-1].split('.')[0] + '.txt' 57 | question = answer['prompt'] 58 | results[category].append((file, answer['prompt'], answer['text'])) 59 | 60 | unmatched_prompts = [] 61 | for category, cate_tups in results.items(): 62 | with open(os.path.join(result_dir, f'{category}.txt'), 'w') as fp: 63 | for file, prompt, answer in cate_tups: 64 | original_prompt = prompt # Keep original prompt for logging 65 | if 'Answer the question using a single word or phrase.' in prompt: 66 | prompt = prompt.replace('Answer the question using a single word or phrase.', '').strip() 67 | if 'Please answer yes or no.' not in prompt: 68 | prompt = prompt + ' Please answer yes or no.' 69 | if (category, file, prompt) not in GT: 70 | prompt = prompt.replace(' Please answer yes or no.', ' Please answer yes or no.') 71 | try: 72 | gt_ans = GT[(category, file, prompt)] 73 | except KeyError: 74 | # print(f"Ground truth not found for Category: {category}, File: {file}, Prompt: {prompt}") 75 | gt_ans = "Yes" # Defaults to yes 76 | unmatched_prompts.append((category, file, prompt)) 77 | tup = file, prompt, gt_ans, answer 78 | fp.write('\t'.join(tup) + '\n') 79 | 80 | if unmatched_prompts: 81 | print(f"WARNING: {len(unmatched_prompts)} ground truths were missing during eval and defaulted to 'Yes'. This will lead to inaccurate eval.") 82 | print("-------------Summary of Missing Prompts-------------") 83 | for category, file, prompt in unmatched_prompts: 84 | print(f"Category: {category}, File: {file}, Prompt: {prompt}") 85 | print('\n\n') 86 | -------------------------------------------------------------------------------- /llava/eval/maya/load_mme.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | This script loads the MME dataset from Hugging Face, saves images and Y/N answers to the specified folder structure. 
5 | """ 6 | 7 | import os 8 | import json 9 | import pandas as pd 10 | from datasets import load_dataset 11 | from PIL import Image 12 | 13 | def main(): 14 | # Define paths 15 | questions_file = "./playground/data/eval/MME/llava_mme.jsonl" 16 | answers_file = "./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl" 17 | images_folder = "./playground/data/eval/MME/MME_Benchmark_release_version" 18 | 19 | # Load the MME dataset from disk 20 | mme_dataset = load_dataset("lmms-lab/MME") 21 | print("Dataset loaded successfully.\n") 22 | 23 | # Ensure base image save directory exists 24 | image_save_base_path = "./playground/data/eval/MME/MME_Benchmark_release_version" 25 | os.makedirs(image_save_base_path, exist_ok=True) 26 | 27 | # Categories that require an extra subdirectory for Y/N answers and images 28 | special_categories = ["artwork", "celebrity", "landmark", "scene", "posters"] 29 | 30 | # Save Y/N Answers to Folder 31 | def save_answer(example): 32 | question_id = example['question_id'] # Format: category/image.png 33 | question = example['question'] 34 | answer = example['answer'] 35 | 36 | category = question_id.split('/')[0] 37 | image_name = os.path.basename(question_id) 38 | image_basename = os.path.splitext(image_name)[0] 39 | 40 | # Special case handling – one entry on HuggingFace is different than the LLaVA repo 41 | if category == 'code_reasoning' and image_basename == '0020': 42 | question = "Is a python code shown in the picture? Please answer yes or no." 43 | 44 | # Determine the save directory 45 | if category in special_categories: 46 | full_save_dir = os.path.join(image_save_base_path, category, "questions_answers_YN") 47 | else: 48 | full_save_dir = os.path.join(image_save_base_path, category, "questions_answers_YN") 49 | 50 | # Create the directory if it doesn't exist 51 | os.makedirs(full_save_dir, exist_ok=True) 52 | 53 | # Define the full path for the answer file 54 | answer_save_path = os.path.join(full_save_dir, f"{image_basename}.txt") 55 | 56 | # Append the question and answer to the text file 57 | with open(answer_save_path, 'a') as answer_file: 58 | answer_file.write(f"{question}\t{answer}\n") 59 | 60 | return {'answer_save_path': answer_save_path} 61 | 62 | # Save Images to Folder 63 | def save_images(example): 64 | image = example['image'] # Assuming 'image' is a PIL Image object 65 | image_subdir = example['question_id'] # category/image.png 66 | category = image_subdir.split('/')[0] # extract the category (first part of question_id) 67 | file_name = os.path.basename(image_subdir) # extract the file name (including extension) 68 | 69 | # Get the image extension (e.g., ".png", ".jpg") 70 | _, extension = os.path.splitext(file_name) 71 | 72 | # Check if the category is one of the special ones that needs an extra 'images' folder 73 | if category in special_categories: 74 | # Save in the category/images/ structure (e.g., artwork/images/16006.jpg) 75 | full_save_dir = os.path.join(image_save_base_path, category, "images") 76 | else: 77 | # Save in the usual category/image.png structure (e.g., code_reasoning/0012.png) 78 | full_save_dir = os.path.join(image_save_base_path, category) 79 | 80 | # Create the subdirectory if it doesn't exist 81 | os.makedirs(full_save_dir, exist_ok=True) 82 | 83 | # Create the full image save path 84 | image_save_path = os.path.join(full_save_dir, file_name) 85 | 86 | # Save the image to the specified path in the correct format 87 | if extension.lower() == ".jpg" or extension.lower() == ".jpeg": 88 | image.save(image_save_path, 
format="JPEG") 89 | elif extension.lower() == ".png": 90 | image.save(image_save_path, format="PNG") 91 | else: 92 | # Handle other formats or default to PNG (though this shouldn't happen!) 93 | image.save(image_save_path, format="PNG") 94 | 95 | return {'image_save_path': image_save_path} 96 | 97 | # Apply the save_answer and save_images functions to the 'test' split of the dataset 98 | print("Saving Y/N answers to disk...") 99 | _ = mme_dataset['test'].map(save_answer) 100 | print("Answers sucessfully saved.") 101 | print("Saving images to disk. This might slow down around the 2000 image mark; be patient, it will speed up ") 102 | _ = mme_dataset['test'].map(save_images) 103 | print("Images sucessfully saved.") 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /llava/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from llava.conversation import default_conversation 10 | from llava.utils import disable_torch_init 11 | 12 | 13 | @torch.inference_mode() 14 | def eval_model(model_name, questions_file, answers_file): 15 | # Model 16 | disable_torch_init() 17 | model_name = os.path.expanduser(model_name) 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 19 | model = AutoModelForCausalLM.from_pretrained(model_name, 20 | torch_dtype=torch.float16).cuda() 21 | 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 41 | try: 42 | index = outputs.index(conv.sep, len(prompt)) 43 | except ValueError: 44 | outputs += conv.sep 45 | index = outputs.index(conv.sep, len(prompt)) 46 | 47 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 48 | ans_id = shortuuid.uuid() 49 | ans_file.write(json.dumps({"question_id": idx, 50 | "text": outputs, 51 | "answer_id": ans_id, 52 | "model_id": model_name, 53 | "metadata": {}}) + "\n") 54 | ans_file.flush() 55 | ans_file.close() 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 60 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 61 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 62 | args = parser.parse_args() 63 | 64 | eval_model(args.model_name, args.question_file, args.answers_file) 65 | -------------------------------------------------------------------------------- /llava/eval/model_vqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import json 5 | from tqdm 
import tqdm 6 | import shortuuid 7 | 8 | from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 9 | from llava.conversation import conv_templates, SeparatorStyle 10 | from llava.model.builder import load_pretrained_model 11 | from llava.utils import disable_torch_init 12 | from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path 13 | from llava.eval.maya.eval_utils import load_maya_model 14 | 15 | from PIL import Image 16 | import math 17 | 18 | 19 | def split_list(lst, n): 20 | """Split a list into n (roughly) equal-sized chunks""" 21 | chunk_size = math.ceil(len(lst) / n) # integer division 22 | return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] 23 | 24 | 25 | def get_chunk(lst, n, k): 26 | chunks = split_list(lst, n) 27 | return chunks[k] 28 | 29 | 30 | def eval_model(args): 31 | # Model 32 | disable_torch_init() 33 | model_path = os.path.expanduser(args.model_path) 34 | model_name = get_model_name_from_path(model_path) 35 | if 'maya' not in model_name: 36 | tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) 37 | else: 38 | model, tokenizer, image_processor, context_len = load_maya_model(args.model_base, model_path, mode = args.mode) 39 | 40 | questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] 41 | questions = get_chunk(questions, args.num_chunks, args.chunk_idx) 42 | answers_file = os.path.expanduser(args.answers_file) 43 | os.makedirs(os.path.dirname(answers_file), exist_ok=True) 44 | ans_file = open(answers_file, "w") 45 | for line in tqdm(questions): 46 | idx = line["question_id"] 47 | image_file = line["image"] 48 | qs = line["text"] 49 | cur_prompt = qs 50 | if model.config.mm_use_im_start_end: 51 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 52 | else: 53 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 54 | 55 | conv = conv_templates[args.conv_mode].copy() 56 | conv.append_message(conv.roles[0], qs) 57 | conv.append_message(conv.roles[1], None) 58 | prompt = conv.get_prompt() 59 | 60 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 61 | 62 | image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') 63 | image_tensor = process_images([image], image_processor, model.config)[0] 64 | 65 | with torch.inference_mode(): 66 | output_ids = model.generate( 67 | input_ids, 68 | images=image_tensor.unsqueeze(0).half().cuda(), 69 | image_sizes=[image.size], 70 | do_sample=True if args.temperature > 0 else False, 71 | temperature=args.temperature, 72 | top_p=args.top_p, 73 | num_beams=args.num_beams, 74 | # no_repeat_ngram_size=3, 75 | max_new_tokens=1024, 76 | use_cache=True) 77 | 78 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 79 | 80 | ans_id = shortuuid.uuid() 81 | ans_file.write(json.dumps({"question_id": idx, 82 | "prompt": cur_prompt, 83 | "text": outputs, 84 | "answer_id": ans_id, 85 | "model_id": model_name, 86 | "metadata": {}}) + "\n") 87 | ans_file.flush() 88 | ans_file.close() 89 | 90 | if __name__ == "__main__": 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument("--model-path", type=str, default="nahidalam/maya_full_ft") 93 | parser.add_argument("--model-base", type=str, default="CohereForAI/aya-23-8B") 94 | parser.add_argument("--mode", type=str, default="finetuned") 95 | 
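    # Example invocation (paths are illustrative placeholders; --num-chunks/--chunk-idx shard the
    # question file across several processes, typically one chunk per GPU):
    #   python -m llava.eval.model_vqa \
    #       --model-path nahidalam/maya_full_ft --model-base CohereForAI/aya-23-8B \
    #       --question-file questions.jsonl --image-folder ./images \
    #       --answers-file answers_chunk0.jsonl --num-chunks 4 --chunk-idx 0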
parser.add_argument("--image-folder", type=str, default="") 96 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 97 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 98 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 99 | parser.add_argument("--num-chunks", type=int, default=1) 100 | parser.add_argument("--chunk-idx", type=int, default=0) 101 | parser.add_argument("--temperature", type=float, default=0.2) 102 | parser.add_argument("--top_p", type=float, default=None) 103 | parser.add_argument("--num_beams", type=int, default=1) 104 | args = parser.parse_args() 105 | 106 | eval_model(args) 107 | -------------------------------------------------------------------------------- /llava/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /llava/eval/run_llava.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from llava.constants import ( 5 | IMAGE_TOKEN_INDEX, 6 | DEFAULT_IMAGE_TOKEN, 7 | DEFAULT_IM_START_TOKEN, 8 | DEFAULT_IM_END_TOKEN, 9 | IMAGE_PLACEHOLDER, 10 | ) 11 | from llava.conversation import conv_templates, 
SeparatorStyle 12 | from llava.model.builder import load_pretrained_model 13 | from llava.utils import disable_torch_init 14 | from llava.mm_utils import ( 15 | process_images, 16 | tokenizer_image_token, 17 | get_model_name_from_path, 18 | ) 19 | 20 | from PIL import Image 21 | 22 | import requests 23 | from PIL import Image 24 | from io import BytesIO 25 | import re 26 | 27 | 28 | def image_parser(args): 29 | out = args.image_file.split(args.sep) 30 | return out 31 | 32 | 33 | def load_image(image_file): 34 | if image_file.startswith("http") or image_file.startswith("https"): 35 | response = requests.get(image_file) 36 | image = Image.open(BytesIO(response.content)).convert("RGB") 37 | else: 38 | image = Image.open(image_file).convert("RGB") 39 | return image 40 | 41 | 42 | def load_images(image_files): 43 | out = [] 44 | for image_file in image_files: 45 | image = load_image(image_file) 46 | out.append(image) 47 | return out 48 | 49 | 50 | def eval_model(args): 51 | # Model 52 | disable_torch_init() 53 | 54 | model_name = get_model_name_from_path(args.model_path) 55 | tokenizer, model, image_processor, context_len = load_pretrained_model( 56 | args.model_path, args.model_base, model_name 57 | ) 58 | 59 | qs = args.query 60 | image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN 61 | if IMAGE_PLACEHOLDER in qs: 62 | if model.config.mm_use_im_start_end: 63 | qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs) 64 | else: 65 | qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs) 66 | else: 67 | if model.config.mm_use_im_start_end: 68 | qs = image_token_se + "\n" + qs 69 | else: 70 | qs = DEFAULT_IMAGE_TOKEN + "\n" + qs 71 | 72 | if "llama-2" in model_name.lower(): 73 | conv_mode = "llava_llama_2" 74 | elif "mistral" in model_name.lower(): 75 | conv_mode = "mistral_instruct" 76 | elif "v1.6-34b" in model_name.lower(): 77 | conv_mode = "chatml_direct" 78 | elif "v1" in model_name.lower(): 79 | conv_mode = "llava_v1" 80 | elif "mpt" in model_name.lower(): 81 | conv_mode = "mpt" 82 | else: 83 | conv_mode = "llava_v0" 84 | 85 | if args.conv_mode is not None and conv_mode != args.conv_mode: 86 | print( 87 | "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format( 88 | conv_mode, args.conv_mode, args.conv_mode 89 | ) 90 | ) 91 | else: 92 | args.conv_mode = conv_mode 93 | 94 | conv = conv_templates[args.conv_mode].copy() 95 | conv.append_message(conv.roles[0], qs) 96 | conv.append_message(conv.roles[1], None) 97 | prompt = conv.get_prompt() 98 | 99 | image_files = image_parser(args) 100 | images = load_images(image_files) 101 | image_sizes = [x.size for x in images] 102 | images_tensor = process_images( 103 | images, 104 | image_processor, 105 | model.config 106 | ).to(model.device, dtype=torch.float16) 107 | 108 | input_ids = ( 109 | tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") 110 | .unsqueeze(0) 111 | .cuda() 112 | ) 113 | 114 | with torch.inference_mode(): 115 | output_ids = model.generate( 116 | input_ids, 117 | images=images_tensor, 118 | image_sizes=image_sizes, 119 | do_sample=True if args.temperature > 0 else False, 120 | temperature=args.temperature, 121 | top_p=args.top_p, 122 | num_beams=args.num_beams, 123 | max_new_tokens=args.max_new_tokens, 124 | use_cache=True, 125 | ) 126 | 127 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 128 | print(outputs) 129 | 130 | 131 | if __name__ == "__main__": 132 | parser = argparse.ArgumentParser() 133 | 
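    # Example invocation (model path and image file are illustrative; several images can be passed
    # as a single --image-file string joined by the --sep separator):
    #   python -m llava.eval.run_llava \
    #       --model-path liuhaotian/llava-v1.5-7b \
    #       --image-file view.jpg \
    #       --query "What is shown in this image?"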
parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 134 | parser.add_argument("--model-base", type=str, default=None) 135 | parser.add_argument("--image-file", type=str, required=True) 136 | parser.add_argument("--query", type=str, required=True) 137 | parser.add_argument("--conv-mode", type=str, default=None) 138 | parser.add_argument("--sep", type=str, default=",") 139 | parser.add_argument("--temperature", type=float, default=0.2) 140 | parser.add_argument("--top_p", type=float, default=None) 141 | parser.add_argument("--num_beams", type=int, default=1) 142 | parser.add_argument("--max_new_tokens", type=int, default=512) 143 | args = parser.parse_args() 144 | 145 | eval_model(args) 146 | -------------------------------------------------------------------------------- /llava/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | 
{"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /llava/eval/table/prompt.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt_id": 1, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, "description": "Prompt for general questions"} 2 | {"prompt_id": 2, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. 
Then give extra comments starting from the next line."}, "description": "Prompt for coding questions"} 3 | {"prompt_id": 3, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."}, "description": "Prompt for math questions"} 4 | {"prompt_id": 4, "system_prompt": "You are a helpful and precise assistant for checking the quality of the answer.", "prompt_template": "[Visual Context]\n{context}\n[Question]\n{question}\n\n[Assistant 1]\n{answer_1}\n\n[End of Assistant 1]\n\n[Assistant 2]\n{answer_2}\n\n[End of Assistant 2]\n\n[System]\n{prompt}\n\n", "defaults": {"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, "description": "Prompt for visual questions"} 5 | -------------------------------------------------------------------------------- /llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/llava/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /llava/eval/webpage/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 3 | background-color: #f8f9fa; 4 | } 5 | 6 | .navbar-dark .navbar-nav .nav-link { 7 | color: #f1cf68; 8 | font-size: 1.1rem; 9 | padding: 0.5rem 0.6rem; 10 | } 11 | 12 | .card-header { 13 | font-weight: bold; 14 | } 15 | 16 | .card { 17 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 18 | transition: 0.3s; 19 | } 20 | 21 | .card:hover { 22 | box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); 23 | } 24 | 25 | 
button { 26 | transition: background-color 0.3s; 27 | } 28 | 29 | button:hover { 30 | background-color: #007bff; 31 | } 32 | 33 | @media (max-width: 767px) { 34 | .form-row .form-group { 35 | margin-bottom: 10px; 36 | } 37 | } 38 | 39 | /* Extra styles */ 40 | 41 | .expandable-card .card-text-container { 42 | max-height: 200px; 43 | overflow-y: hidden; 44 | position: relative; 45 | } 46 | 47 | .expandable-card.expanded .card-text-container { 48 | max-height: none; 49 | } 50 | 51 | .expand-btn { 52 | position: relative; 53 | display: none; 54 | background-color: rgba(255, 255, 255, 0.8); 55 | color: #510c75; 56 | border-color: transparent; 57 | } 58 | 59 | .expand-btn:hover { 60 | background-color: rgba(200, 200, 200, 0.8); 61 | text-decoration: none; 62 | border-color: transparent; 63 | color: #510c75; 64 | } 65 | 66 | .expand-btn:focus { 67 | outline: none; 68 | text-decoration: none; 69 | } 70 | 71 | .expandable-card:not(.expanded) .card-text-container:after { 72 | content: ""; 73 | position: absolute; 74 | bottom: 0; 75 | left: 0; 76 | width: 100%; 77 | height: 90px; 78 | background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); 79 | } 80 | 81 | .expandable-card:not(.expanded) .expand-btn { 82 | margin-top: -40px; 83 | } 84 | 85 | .card-body { 86 | padding-bottom: 5px; 87 | } 88 | 89 | .vertical-flex-layout { 90 | justify-content: center; 91 | align-items: center; 92 | height: 100%; 93 | display: flex; 94 | flex-direction: column; 95 | gap: 5px; 96 | } 97 | 98 | .figure-img { 99 | max-width: 100%; 100 | height: auto; 101 | } 102 | 103 | .adjustable-font-size { 104 | font-size: calc(0.5rem + 2vw); 105 | } 106 | -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | from .language_model.llava_cohere import LlavaCohereForCausalLM, LlavaCohereConfig 6 | except: 7 | pass 8 | -------------------------------------------------------------------------------- /llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += 
base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /llava/model/make_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading target model") 19 | auto_upgrade(target_model_path) 20 | target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 21 | 22 | print("Calculating delta") 23 | for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data -= base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} 
dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 31 | bparam = base.state_dict()[name] 32 | param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam 33 | 34 | print("Saving delta") 35 | if hub_repo_id: 36 | kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} 37 | else: 38 | kwargs = {} 39 | target.save_pretrained(delta_path, **kwargs) 40 | target_tokenizer = AutoTokenizer.from_pretrained(target_model_path) 41 | target_tokenizer.save_pretrained(delta_path, **kwargs) 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument("--base-model-path", type=str, required=True) 47 | parser.add_argument("--target-model-path", type=str, required=True) 48 | parser.add_argument("--delta-path", type=str, required=True) 49 | parser.add_argument("--hub-repo-id", type=str, default=None) 50 | args = parser.parse_args() 51 | 52 | make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id) 53 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | from .siglip_encoder import SiglipVisionTower 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 10 | if use_s2: 11 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 12 | else: 13 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 14 | elif 'siglip' in vision_tower: 15 | return SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | raise ValueError(f'Unknown vision tower: {vision_tower}') 17 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | 
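            # Each extra depth level appends GELU + Linear(hidden_size -> hidden_size); e.g.
            # projector_type='mlp2x_gelu' builds Linear(mm_hidden_size -> hidden_size) -> GELU -> Linear(hidden_size -> hidden_size).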
modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | import warnings 3 | 4 | import torch 5 | 6 | import transformers 7 | from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv 8 | 9 | try: 10 | from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func 11 | except ImportError: 12 | from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func 13 | from flash_attn.bert_padding import unpad_input, pad_input 14 | 15 | 16 | def forward( 17 | self, 18 | hidden_states: torch.Tensor, 19 | attention_mask: Optional[torch.Tensor] = None, 20 | position_ids: Optional[torch.Tensor] = None, 21 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 22 | output_attentions: bool = False, 23 | use_cache: bool = False, 24 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 25 | if output_attentions: 26 | warnings.warn( 27 | "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." 
28 | ) 29 | 30 | bsz, q_len, _ = hidden_states.size() 31 | 32 | query_states = ( 33 | self.q_proj(hidden_states) 34 | .view(bsz, q_len, self.num_heads, self.head_dim) 35 | .transpose(1, 2) 36 | ) 37 | key_states = ( 38 | self.k_proj(hidden_states) 39 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 40 | .transpose(1, 2) 41 | ) 42 | value_states = ( 43 | self.v_proj(hidden_states) 44 | .view(bsz, q_len, self.num_key_value_heads, self.head_dim) 45 | .transpose(1, 2) 46 | ) # shape: (b, num_heads, s, head_dim) 47 | 48 | kv_seq_len = key_states.shape[-2] 49 | if past_key_value is not None: 50 | kv_seq_len += past_key_value[0].shape[-2] 51 | 52 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 53 | query_states, key_states = apply_rotary_pos_emb( 54 | query_states, key_states, cos, sin, position_ids 55 | ) 56 | 57 | if past_key_value is not None: 58 | # reuse k, v 59 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 60 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 61 | 62 | past_key_value = (key_states, value_states) if use_cache else None 63 | 64 | # repeat k/v heads if n_kv_heads < n_heads 65 | key_states = repeat_kv(key_states, self.num_key_value_groups) 66 | value_states = repeat_kv(value_states, self.num_key_value_groups) 67 | 68 | # Transform the data into the format required by flash attention 69 | qkv = torch.stack([query_states, key_states, value_states], dim=2) 70 | qkv = qkv.transpose(1, 3) # shape: [b, s, 3, num_heads, head_dim] 71 | key_padding_mask = attention_mask 72 | 73 | if key_padding_mask is None: 74 | qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim) 75 | cu_q_lens = torch.arange( 76 | 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device 77 | ) 78 | max_s = q_len 79 | output = flash_attn_unpadded_qkvpacked_func( 80 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 81 | ) 82 | output = output.view(bsz, q_len, -1) 83 | else: 84 | qkv = qkv.reshape(bsz, q_len, -1) 85 | qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask) 86 | qkv = qkv.view(-1, 3, self.num_heads, self.head_dim) 87 | output_unpad = flash_attn_unpadded_qkvpacked_func( 88 | qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True 89 | ) 90 | output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim) 91 | output = pad_input(output_unpad, indices, bsz, q_len) 92 | 93 | return self.o_proj(output), None, past_key_value 94 | 95 | 96 | # Disable the transformation of the attention mask in LlamaModel as the flash attention 97 | # requires the attention mask to be the same as the key_padding_mask 98 | def _prepare_decoder_attention_mask( 99 | self, attention_mask, input_shape, inputs_embeds, past_key_values_length 100 | ): 101 | # [bsz, seq_len] 102 | return attention_mask 103 | 104 | 105 | def replace_llama_attn_with_flash_attn(): 106 | cuda_major, cuda_minor = torch.cuda.get_device_capability() 107 | if cuda_major < 8: 108 | warnings.warn( 109 | "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward." 
110 | "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593" 111 | ) 112 | transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( 113 | _prepare_decoder_attention_mask 114 | ) 115 | transformers.models.llama.modeling_llama.LlamaAttention.forward = forward 116 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /llava/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from llava.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True, encoding='UTF-8') 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 
63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /playground/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nahidalam/maya/4a24bb958e217bf7ceb92e9c0b6373f88117e59b/playground/__init__.py -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/000_caps.txt: -------------------------------------------------------------------------------- 1 | A man wearing multiple neck ties making a goofy face. 2 | A man in a white shirt wearing very many ties. 3 | a man with ties on poses for a picture 4 | A man wearing multiple ties on his neck. 5 | A young man smiles while wearing several ties. 
6 | 7 | tie: [0.574, 0.298, 0.752, 0.704] 8 | tie: [0.464, 0.339, 0.639, 0.789] 9 | tie: [0.349, 0.363, 0.563, 0.732] 10 | tie: [0.259, 0.255, 0.668, 0.805] 11 | person: [0.019, 0.065, 0.962, 0.988] 12 | person: [0.0, 0.24, 0.214, 1.0] 13 | tie: [0.316, 0.778, 0.443, 0.867] 14 | tie: [0.386, 0.707, 0.496, 0.801] 15 | tie: [0.251, 0.354, 0.402, 0.747] 16 | tie: [0.44, 0.318, 0.63, 0.397] 17 | tie: [0.566, 0.33, 0.655, 0.543] 18 | tie: [0.25, 0.409, 0.359, 0.556] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/000_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What is unusual about this photo? 3 | === 4 | Answer: 5 | In the photo, the man is wearing a total of ten ties around his neck. This is considered unusual because it is not common for someone to wear multiple ties at once. In a typical setting, a person would wear only one tie. The man's goofy facial expression also adds to the unconventional and humorous nature of the image. -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/001_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges do these people face? 3 | === 4 | Answer: 5 | In the image, a group of people is standing outside a black SUV in a parking area, surrounded by various pieces of luggage, including suitcases and backpacks. They are facing the challenge of fitting all their luggage into the black SUV. There are multiple suitcases and backpacks to be packed, which suggests that the group has a significant amount of belongings to accommodate. They might have to strategize and arrange the luggage efficiently to ensure that everything fits properly into the vehicle. Additionally, they need to consider the comfort of the passengers and visibility while driving, so the placement of the luggage must not obstruct the driver's view or make the passengers uncomfortable during the trip. -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/002_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 
3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. 6 | 7 | fire hydrant: [0.326, 0.612, 0.426, 0.72] -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/002_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What challenges might this city face? 3 | === 4 | Answer: 5 | The city faces challenges due to the harsh winter conditions and heavy snowfall. In the image, a red fire hydrant is almost buried deep in the snow, which indicates the significant amount of snow the city has experienced. This can lead to various challenges such as difficulties in transportation, increased risk of accidents, and disruptions to daily life. For example, the recently plowed sidewalk near the fire hydrant shows that the city has to constantly clear snow from roads and sidewalks to maintain access and safety for pedestrians and vehicles. Moreover, emergency services, like firefighters, might face challenges accessing crucial equipment, such as fire hydrants, during emergencies due to the snow accumulation. This highlights the importance of effective snow management strategies and preparedness in such cities to minimize the impact of harsh winter conditions on residents and essential services. -------------------------------------------------------------------------------- /playground/data/prompts/complex_reasoning/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | The task is to use the provided caption and bounding box information, create a plausible question about the image, and provide the answer in detail. 4 | 5 | Create complex questions beyond describing the scene. 6 | To answer such questions, one should require first understanding the visual content, then based on the background knowledge or reasoning, either explain why the things are happening that way, or provide guides and help to user's request. Make the question challenging by not including the visual content details in the question so that the user needs to reason about that first. 7 | 8 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 9 | 10 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. -------------------------------------------------------------------------------- /playground/data/prompts/conversation/000_caps.txt: -------------------------------------------------------------------------------- 1 | There is a movie theater that displays the show times above the doors. 2 | A red fire hydrant is deep in the snow. 
3 | The fire hydrant is in the snow near a recently plowed sidewalk. 4 | This city has had a very hard winter with snow. 5 | A hotel for dogs in the snow in winter. -------------------------------------------------------------------------------- /playground/data/prompts/conversation/000_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What color is the fire hydrant in the image? 3 | === 4 | Answer: 5 | The fire hydrant in the image is red. 6 | === 7 | Question: 8 | Is the fire hydrant fully covered by snow or partially visible? 9 | === 10 | Answer: 11 | The fire hydrant is partially visible, with a significant portion of it covered by the deep snow. 12 | === 13 | Question: 14 | What is the condition of the sidewalk near the fire hydrant? 15 | === 16 | Answer: 17 | The sidewalk near the fire hydrant has been recently plowed, so it is relatively clear of snow compared to the surrounding area. 18 | === 19 | Question: 20 | What does the image suggest about the winter season in the city? 21 | === 22 | Answer: 23 | The image suggests that the winter season in the city has been quite harsh and challenging. The significant snow accumulation seen in the image indicates that the city has experienced heavy snowfall, which can lead to various difficulties and disruptions for its residents. 24 | 25 | For instance, the deep snow covering the fire hydrant could potentially hinder its accessibility in case of emergencies, which poses safety concerns. The recently plowed sidewalk implies that the city's maintenance crews have been actively working to keep the walkways clear and safe for pedestrians, but the sheer amount of snowfall might make it difficult to keep up with the continuous snow removal efforts. 26 | 27 | Furthermore, such extreme winter conditions can impact transportation, leading to traffic issues, delays in public transit, and increased risks of accidents due to slippery roads. It can also cause problems for businesses and schools, as people might face challenges commuting to work or attending classes. Additionally, the heavy snow can put extra strain on infrastructure, such as roofs and power lines, increasing the likelihood of structural damage or power outages. 28 | 29 | In conclusion, the image of the red fire hydrant deep in the snow and the recently plowed sidewalk suggest that the city has faced a particularly severe winter season, with substantial snowfall that has likely caused various challenges and disruptions for its residents and infrastructure. -------------------------------------------------------------------------------- /playground/data/prompts/conversation/001_caps.txt: -------------------------------------------------------------------------------- 1 | A man is skiing in the open snow covered hills 2 | A skier is making his way into the snow. 3 | A skier on the foothills of a huge mountain range. 4 | A skier looks at mountains while standing near a trail sign. 5 | a single person skiing by an area with a lot of bushes -------------------------------------------------------------------------------- /playground/data/prompts/conversation/001_conv.txt: -------------------------------------------------------------------------------- 1 | Question: 2 | What is the skier doing? 3 | === 4 | Answer: 5 | The skier is skiing in the snow-covered hills, making his way through the snow and enjoying the beautiful mountain scenery. They are also standing near a trail sign, which indicates their location on the mountain. 
6 | === 7 | Question: 8 | Are there any other people or skiers visible in the image? 9 | === 10 | Answer: 11 | No, there is only a single person skiing in the image, and they are surrounded by an area with a lot of bushes. 12 | === 13 | Question: 14 | What kind of skiing is the person doing? Cross-country or downhill? 15 | === 16 | Answer: 17 | Based on what I can see in the photo, it is not possible to definitively determine whether the skier is doing cross-country or downhill skiing. However, given the presence of a trail sign and the fact that they are skiing in the foothills, it is more likely that the skier is engaged in cross-country skiing. Downhill skiing typically takes place on more steep slopes and involves ski lifts, whereas cross-country skiing often occurs on relatively flatter terrain with marked trails. 18 | === 19 | Question: 20 | Can you explain cross-country and downhill skiing a bit more? 21 | === 22 | Answer: 23 | Cross-country skiing and downhill skiing are two distinct types of skiing, each with its own techniques, equipment, and terrain preferences. 24 | 25 | 1. Cross-Country Skiing: 26 | Also known as Nordic skiing, cross-country skiing involves traversing relatively flat terrain or gentle slopes using skis and poles. It is a popular recreational and competitive sport that focuses on endurance, balance, and aerobic fitness. Cross-country skiers use their own locomotion to move across snow-covered terrain, rather than relying on ski lifts or other forms of assistance. 27 | There are two main techniques in cross-country skiing: 28 | - Classic: This technique involves skiing in parallel tracks, using a forward-and-backward gliding motion while keeping the skis parallel to each other. The skier uses their poles to propel themselves forward. 29 | - Skate skiing: This technique is similar to ice skating, where the skier pushes off one ski at an angle and glides on the other. It requires a smooth, groomed surface and is typically faster than the classic technique. 30 | Cross-country ski equipment consists of lightweight skis, boots, bindings, and poles. The boots are more flexible than those used in downhill skiing, allowing for better ankle movement and control. 31 | 32 | 2. Downhill Skiing: 33 | Also known as alpine skiing, downhill skiing involves descending slopes at high speeds using skis and poles for balance and control. This sport is more focused on speed, technique, and navigating challenging terrain, including steep slopes, moguls, and even jumps. 34 | Downhill skiing can be further categorized into several disciplines, such as slalom, giant slalom, super-G, and downhill racing. Each discipline has its own set of rules, courses, and ski equipment. 35 | Downhill ski equipment includes heavier and stiffer skis, boots, bindings, and poles than those used in cross-country skiing. The boots are more rigid to provide better support and control during high-speed descents and sharp turns. 36 | 37 | In summary, cross-country skiing is an endurance-based sport that involves traveling across flat or gently sloping terrain, while downhill skiing is focused on speed and technique as skiers navigate steeper slopes and challenging terrain. Both sports require specialized equipment and techniques, but they offer different experiences and challenges to participants. 
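Note on the layout of the few-shot files above: each *_conv.txt under playground/data/prompts/ alternates "Question:" and "Answer:" blocks separated by lines containing only "===". The following is a minimal sketch (a hypothetical helper, not a script that ships with this repository) of how such a file could be parsed into question/answer pairs under that assumption:

# Hypothetical helper, not part of the repository: split a playground
# prompts *_conv.txt file on "===" separators and collect (question, answer) pairs.
from pathlib import Path
from typing import List, Tuple


def parse_conv_file(path: str) -> List[Tuple[str, str]]:
    blocks = [b.strip() for b in Path(path).read_text().split("===")]
    pairs, question = [], None
    for block in blocks:
        if block.startswith("Question:"):
            question = block[len("Question:"):].strip()
        elif block.startswith("Answer:") and question is not None:
            pairs.append((question, block[len("Answer:"):].strip()))
            question = None
    return pairs


if __name__ == "__main__":
    # Example path; adjust to wherever the prompt files are checked out.
    for q, a in parse_conv_file("playground/data/prompts/conversation/000_conv.txt"):
        print(q, "->", a[:60])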
-------------------------------------------------------------------------------- /playground/data/prompts/conversation/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant, and you are seeing a single image. What you see are provided with five sentences, describing the same image you are looking at. Answer all questions as you are seeing the image. 2 | 3 | Design a conversation between you and a person asking about this photo. The answers should be in a tone that a visual AI assistant is seeing the image and answering the question. 4 | Ask diverse questions and give corresponding answers. 5 | 6 | Include questions asking about the visual content of the image, including the object types, counting the objects, object actions, object locations, relative positions between objects, etc. Only include questions that have definite answers: 7 | (1) one can see the content in the image that the question asks about and can answer confidently; 8 | (2) one can determine confidently from the image that it is not in the image. 9 | Do not ask any question that cannot be answered confidently. 10 | 11 | Also include complex questions that are relevant to the content in the image, for example, asking about background knowledge of the objects in the image, asking to discuss about events happening in the image, etc. Again, do not ask about uncertain details. 12 | Provide detailed answers when answering complex questions. For example, give detailed examples or reasoning steps to make the content more convincing and well-organized. You can include multiple paragraphs if necessary. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/000_caps.txt: -------------------------------------------------------------------------------- 1 | A harbor filled with lots of boats next to a building. 2 | A bicycle parked in front of several boats at a dock. 3 | A red bicycle in front of a line of docked white yachts 4 | A bike sits before boats which sit before a long building. 5 | A bicycle is a convenient means of land transportation when you live on a boat. 6 | 7 | bicycle: [0.287, 0.641, 0.507, 0.874] 8 | bicycle: [0.566, 0.667, 0.63, 0.731] 9 | boat: [0.318, 0.579, 0.575, 0.724] 10 | boat: [0.704, 0.607, 0.818, 0.727] 11 | boat: [0.818, 0.601, 0.942, 0.744] 12 | boat: [0.002, 0.53, 0.243, 0.71] 13 | boat: [0.541, 0.611, 0.668, 0.731] 14 | person: [0.778, 0.527, 0.797, 0.57] 15 | cup: [0.708, 0.733, 0.724, 0.758] 16 | boat: [0.236, 0.532, 0.404, 0.64] 17 | boat: [0.81, 0.632, 0.836, 0.676] 18 | boat: [0.957, 0.526, 1.0, 0.752] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/000_conv.txt: -------------------------------------------------------------------------------- 1 | It is a harbor filled with numerous boats of various sizes docked next to a long building. Among the boats, there are a few white yachts lined up, standing out from the rest. There is a red bicycle prominently parked in front of the line of docked boats, serving as a convenient means of land transportation for those living on the boats. Another bicycle can be seen further back in the scene, near the middle of the harbor. 2 | 3 | A person is visible near the right side of the harbor, possibly enjoying the view or attending to their boat. Additionally, there is a cup placed on a surface near the middle of the scene. 
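Note on the box annotations above: the *_caps.txt files pair each caption set with bounding boxes in the normalized (x1, y1, x2, y2) form described in the system messages, i.e. floats in [0, 1] giving the top-left and bottom-right corners. The following is a minimal sketch (a hypothetical helper, not part of the repository) of mapping such a box back to pixel coordinates, assuming the image's width and height are known:

# Hypothetical helper, not part of the repository: scale a normalized
# (x1, y1, x2, y2) box -- top-left / bottom-right corners in [0, 1] --
# to integer pixel coordinates for an image of the given size.
from typing import Tuple


def to_pixels(box: Tuple[float, float, float, float],
              width: int, height: int) -> Tuple[int, int, int, int]:
    x1, y1, x2, y2 = box
    return (round(x1 * width), round(y1 * height),
            round(x2 * width), round(y2 * height))


if __name__ == "__main__":
    # "bicycle" box from detail_description/000_caps.txt; 640x480 is an assumed image size.
    print(to_pixels((0.287, 0.641, 0.507, 0.874), width=640, height=480))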
-------------------------------------------------------------------------------- /playground/data/prompts/detail_description/001_caps.txt: -------------------------------------------------------------------------------- 1 | A group of people standing outside of a black vehicle with various luggage. 2 | Luggage surrounds a vehicle in an underground parking area 3 | People try to fit all of their luggage in an SUV. 4 | The sport utility vehicle is parked in the public garage, being packed for a trip 5 | Some people with luggage near a van that is transporting it. 6 | 7 | person: [0.681, 0.242, 0.774, 0.694] 8 | person: [0.63, 0.222, 0.686, 0.516] 9 | person: [0.444, 0.233, 0.487, 0.34] 10 | backpack: [0.384, 0.696, 0.485, 0.914] 11 | backpack: [0.755, 0.413, 0.846, 0.692] 12 | suitcase: [0.758, 0.413, 0.845, 0.69] 13 | suitcase: [0.1, 0.497, 0.173, 0.579] 14 | bicycle: [0.282, 0.363, 0.327, 0.442] 15 | car: [0.786, 0.25, 0.848, 0.322] 16 | car: [0.783, 0.27, 0.827, 0.335] 17 | car: [0.86, 0.254, 0.891, 0.3] 18 | car: [0.261, 0.101, 0.787, 0.626] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/001_conv.txt: -------------------------------------------------------------------------------- 1 | The image is an underground parking area with a black sport utility vehicle (SUV) parked. There are three people in the scene, with one person standing closer to the left side of the vehicle, another person in the middle, and the third person on the right side. They are all working together to pack their luggage into the SUV for a trip. 2 | 3 | In the parking area, various luggage items are scattered around the vehicle. There are two backpacks, one located near the left rear wheel and the other closer to the right side of the vehicle. Additionally, there are two suitcases, one on the right side of the car and another further away near the center of the parking area. A bicycle can also be seen on the left side of the vehicle. 4 | 5 | Other cars are parked around the main SUV, with one car positioned behind it and slightly to the left, another behind and slightly to the right, and the third car further behind on the right side. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/002_caps.txt: -------------------------------------------------------------------------------- 1 | A man holds a Wii-mote above his head while another looks on. 2 | A guy and his friend are playing Nintendo Wii. 3 | A young man is holding a video game remote over his head. 4 | two men standing in a room while one plays with a wii mote 5 | Some guys standing and playing a video game. 6 | 7 | couch: [0.697, 0.759, 0.995, 1.0] 8 | dining table: [0.426, 0.755, 1.0, 0.987] 9 | person: [0.082, 0.252, 0.342, 1.0] 10 | person: [0.399, 0.085, 0.742, 0.982] 11 | remote: [0.477, 0.135, 0.516, 0.187] 12 | sink: [0.016, 0.501, 0.063, 0.52] 13 | potted plant: [0.798, 0.384, 0.888, 0.645] 14 | refrigerator: [0.305, 0.389, 0.414, 0.547] 15 | chair: [0.72, 0.509, 0.858, 0.725] -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/002_conv.txt: -------------------------------------------------------------------------------- 1 | The image shows two men standing in a room, engaged in playing a video game on a Nintendo Wii console. 
One of the men is holding a Wii remote above his head with enthusiasm, while the other man looks on, likely enjoying the friendly competition. 2 | 3 | The room appears to be a living space with a couch located in the background and a dining table nearby. A potted plant can be seen placed close to the couch, and a chair is situated in the middle of the room. The room also features a kitchen area with a sink and a refrigerator visible in the background. -------------------------------------------------------------------------------- /playground/data/prompts/detail_description/system_message.txt: -------------------------------------------------------------------------------- 1 | You are an AI visual assistant that can analyze a single image. You receive five sentences, each describing the same image you are observing. In addition, specific object locations within the image are given, along with detailed coordinates. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. 2 | 3 | Using the provided caption and bounding box information, describe the scene in a detailed manner. 4 | 5 | Instead of directly mentioning the bounding box coordinates, utilize this data to explain the scene using natural language. Include details like object counts, position of the objects, relative position between the objects. 6 | 7 | When using the information from the caption and coordinates, directly explain the scene, and do not mention that the information source is the caption or the bounding box. Always answer as if you are directly looking at the image. -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "maya" 7 | version = "1.0" 8 | description = "Maya: An Instruction Finetuned Multilingual Multimodal Model." 
9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | 16 | dependencies = [ 17 | "torch==2.1.2", "torchvision==0.16.2", 18 | "transformers==4.47.0", "tokenizers==0.21", "sentencepiece==0.1.99", "shortuuid==1.0.13", 19 | "accelerate==0.27.2", "datasets==2.15.0", "peft==0.12.0", "bitsandbytes==0.43.3", 20 | "pydantic==2.8.2", "markdown2[all]", "numpy==1.26.4", "scikit-learn==1.2.2", 21 | "gradio==4.16.0", "gradio_client==0.8.1", 22 | "requests==2.32.3", "httpx==0.24.0", "uvicorn", "fastapi", 23 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", 24 | ] 25 | 26 | [project.optional-dependencies] 27 | train = ["deepspeed==0.14.2", "ninja", "wandb==0.17.8"] 28 | build = ["build", "twine"] 29 | 30 | [project.urls] 31 | "Homepage" = "https://huggingface.co/maya-multimodal" 32 | "Bug Tracker" = "https://github.com/nahidalam/maya/issues" 33 | 34 | [tool.setuptools.packages.find] 35 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 36 | 37 | [tool.wheel] 38 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 39 | -------------------------------------------------------------------------------- /requirements.sh: -------------------------------------------------------------------------------- 1 | pip install chardet==5.2.0 2 | pip install datasets==2.15.0 3 | pip install deepspeed==0.14.2 4 | pip install fastapi==0.111.0 5 | pip install transformers==4.42.3 6 | pip install accelerate==0.27.2 7 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, 
f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | eval_single(args.result_file, eval_only_type='video') 63 | 64 | with open(args.result_upload_file, 'w') as fp: 65 | for question in data['questions']: 66 | qid = question['question_id'] 67 | if qid in results: 68 | result = results[qid] 69 | else: 70 | result = results[int(qid)] 71 | fp.write(json.dumps({ 72 | 'question_id': qid, 73 | 'prediction': result['text'] 74 | }) + '\n') 75 | -------------------------------------------------------------------------------- /scripts/convert_sqa_to_llava.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fire 4 | import re 5 | from convert_sqa_to_llava_base_prompt import build_prompt_chatbot 6 | 7 | 8 | def convert_to_llava(base_dir, split, prompt_format="QCM-LEA"): 9 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 10 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 11 | 12 | split_problems = build_prompt_chatbot( 13 | problems, split_indices, prompt_format, 14 | use_caption=False, is_test=False) 15 | 16 | target_format = [] 17 | for prob_id, (input, output) in split_problems.items(): 18 | if input.startswith('Question: '): 19 | input = input.replace('Question: ', '') 20 | if output.startswith('Answer: '): 21 | output = output.replace('Answer: ', '') 22 | 23 | raw_prob_data = problems[prob_id] 24 | if raw_prob_data['image'] is None: 25 | target_format.append({ 26 | "id": prob_id, 27 | "conversations": [ 28 | {'from': 'human', 'value': f"{input}"}, 29 | {'from': 'gpt', 'value': f"{output}"}, 30 | ], 31 | }) 32 | 33 | else: 34 | target_format.append({ 35 | "id": prob_id, 36 | "image": os.path.join(prob_id, raw_prob_data['image']), 37 | "conversations": [ 38 | {'from': 'human', 'value': f"{input}\n"}, 39 | {'from': 'gpt', 'value': f"{output}"}, 40 | ], 41 | }) 42 | 43 | print(f'Number of samples: {len(target_format)}') 44 | 45 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 46 | json.dump(target_format, f, indent=2) 47 | 48 | 49 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 50 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[split] 51 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 52 | 53 | split_problems = build_prompt_chatbot( 54 | problems, split_indices, prompt_format, 55 | use_caption=False, is_test=False) 56 | 57 | writer = open(os.path.join(base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 58 | for prob_id, (input, output) in split_problems.items(): 59 | if input.startswith('Question: '): 60 | input = input.replace('Question: ', '') 61 | if output.startswith('Answer: '): 62 | output = output.replace('Answer: ', '') 63 | 64 | raw_prob_data = problems[prob_id] 65 | if raw_prob_data['image'] is None: 66 | data = { 67 | "id": prob_id, 68 | "instruction": f"{input}", 69 | "output": f"{output}", 70 | } 71 | 72 | else: 73 | data = { 74 | "id": prob_id, 75 | "image": os.path.join(prob_id, raw_prob_data['image']), 76 | "instruction": f"{input}\n", 77 | "output": f"{output}", 78 | } 79 | writer.write(json.dumps(data) + '\n') 80 | writer.close() 81 | 82 | 83 | def main(task, **kwargs): 84 | globals()[task](**kwargs) 85 | 86 | 87 | if __name__ == "__main__": 88 | fire.Fire(main) 89 | -------------------------------------------------------------------------------- /scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--annotation-file', type=str, required=True) 11 | parser.add_argument('--result-file', type=str, required=True) 12 | parser.add_argument('--result-upload-file', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | 
os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | 30 | # Map question_id (concatenation of image + question) to result text 31 | results = {x['question_id']: x['text'] for x in results} 32 | 33 | # Load the entire annotation file (as it's a JSON array) 34 | with open(args.annotation_file, 'r') as f: 35 | test_split = json.load(f) 36 | 37 | # Generate question_id by concatenating image + _ + question 38 | split_ids = set([x['image'] +"_"+ x['question'] for x in test_split]) 39 | 40 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 41 | 42 | all_answers = [] 43 | 44 | answer_processor = EvalAIAnswerProcessor() 45 | 46 | for x in test_split: 47 | question_id = x['question'] + "_"+ x['image'] 48 | try: 49 | assert question_id in results 50 | except: 51 | break 52 | all_answers.append({ 53 | 'image': x['image'], 54 | 'question' : x['question'], 55 | 'answer': answer_processor(results[question_id]) 56 | }) 57 | 58 | # Save the processed answers 59 | with open(args.result_upload_file, 'w') as f: 60 | json.dump(all_answers, f) 61 | -------------------------------------------------------------------------------- /scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument('--ckpt', type=str, required=True) 12 | parser.add_argument('--split', type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == '__main__': 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 21 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 22 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x['question_id']: x['text'] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x['question_id'] for x in test_split]) 36 | 37 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x['question_id'] not in results: 45 | all_answers.append({ 46 | 'question_id': x['question_id'], 47 | 'answer': '' 48 | }) 49 | else: 50 | all_answers.append({ 51 | 'question_id': x['question_id'], 52 | 'answer': answer_processor(results[x['question_id']]) 53 | }) 54 | 55 | with open(dst, 'w') as f: 56 | json.dump(all_answers, open(dst, 'w')) 57 | -------------------------------------------------------------------------------- /scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 
3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /scripts/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_80k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. 
\ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/finetune_full_schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_158k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 30 | --num_train_epochs 3 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --dataloader_num_workers 4 \ 47 | --lazy_preprocess True \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 21 | --version $PROMPT_VERSION \ 22 | --data_path ./playground/data/llava_instruct_80k.json \ 23 | --image_folder /path/to/coco/train2017 \ 24 | --vision_tower openai/clip-vit-large-patch14 \ 25 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 26 | --mm_vision_select_layer -2 \ 27 | --mm_use_im_start_end False \ 28 | --mm_use_im_patch_token False \ 29 | --bf16 True \ 30 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 16 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 1 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 50000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. \ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --dataloader_num_workers 4 \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/finetune_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | ################## VICUNA ################## 8 | # PROMPT_VERSION=v1 9 | # MODEL_VERSION="vicuna-v1-3-7b" 10 | ################## VICUNA ################## 11 | 12 | ################## LLaMA-2 ################## 13 | # PROMPT_VERSION="llava_llama_2" 14 | # MODEL_VERSION="llama-2-7b-chat" 15 | ################## LLaMA-2 ################## 16 | 17 | deepspeed llava/train/train_mem.py \ 18 | --deepspeed ./scripts/zero2.json \ 19 | --lora_enable True \ 20 | --bits 4 \ 21 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 22 | --version $PROMPT_VERSION \ 23 | --data_path ./playground/data/llava_instruct_80k.json \ 24 | --image_folder /path/to/coco/train2017 \ 25 | --vision_tower openai/clip-vit-large-patch14 \ 26 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 27 | --mm_vision_select_layer -2 \ 28 | --mm_use_im_start_end False \ 29 | --mm_use_im_patch_token False \ 30 | --bf16 True \ 31 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 32 | --num_train_epochs 1 \ 33 | --per_device_train_batch_size 16 \ 34 | --per_device_eval_batch_size 4 \ 35 | --gradient_accumulation_steps 1 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 50000 \ 39 | --save_total_limit 1 \ 40 | --learning_rate 2e-5 \ 41 | --weight_decay 0. 
\ 42 | --warmup_ratio 0.03 \ 43 | --lr_scheduler_type "cosine" \ 44 | --logging_steps 1 \ 45 | --tf32 True \ 46 | --model_max_length 2048 \ 47 | --gradient_checkpointing True \ 48 | --lazy_preprocess True \ 49 | --dataloader_num_workers 4 \ 50 | --report_to wandb 51 | -------------------------------------------------------------------------------- /scripts/finetune_sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 4 | 5 | deepspeed llava/train/train_mem.py \ 6 | --deepspeed ./scripts/zero2.json \ 7 | --model_name_or_path lmsys/vicuna-13b-v1.3 \ 8 | --version $PROMPT_VERSION \ 9 | --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \ 10 | --image_folder /Data/ScienceQA/data/scienceqa/images/train \ 11 | --vision_tower openai/clip-vit-large-patch14 \ 12 | --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \ 18 | --num_train_epochs 12 \ 19 | --per_device_train_batch_size 16 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. \ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to wandb 37 | -------------------------------------------------------------------------------- /scripts/maya/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="maya_full_ft" 9 | # CKPT="maya_toxicity_free_finetuned" 10 | SPLIT="llava_gqa_testdev_balanced" 11 | GQADIR="./playground/data/eval/gqa/data" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 15 | --model-path nahidalam/$CKPT \ 16 | --model-base CohereForAI/aya-23-8B \ 17 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 18 | --image-folder ./playground/data/eval/gqa/data/images \ 19 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --conv-mode aya & 24 | done 25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 
34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 39 | 40 | cd $GQADIR 41 | python eval/eval.py --tier testdev_balanced 42 | -------------------------------------------------------------------------------- /scripts/maya/eval/load_mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../" && pwd)" 4 | 5 | LOAD_MME_PY="$PROJECT_ROOT/llava/eval/maya/load_mme.py" 6 | 7 | 8 | cd "$PROJECT_ROOT" 9 | 10 | python "$LOAD_MME_PY" 11 | 12 | -------------------------------------------------------------------------------- /scripts/maya/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path nahidalam/maya_full_ft \ 7 | --model-base CohereForAI/aya-23-8B \ 8 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/maya_full_ft.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode aya 13 | 14 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment maya_full_ft 21 | -------------------------------------------------------------------------------- /scripts/maya/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../" && pwd)" 4 | 5 | # LOAD_MME_PY="$PROJECT_ROOT/llava/eval/maya/load_mme.py" 6 | MODEL_VQA_LOADER_PY="$PROJECT_ROOT/llava/eval/model_vqa_loader.py" 7 | CONVERT_ANSWER_TO_MME_PY="$PROJECT_ROOT/llava/eval/maya/convert_answer_to_mme.py" 8 | MME_CALCULATE_PY="$PROJECT_ROOT/llava/eval/maya/calculate_mme.py" 9 | 10 | 11 | cd "$PROJECT_ROOT" 12 | 13 | # Run this line once if you want to load the data from Hugging Face (caches to disk). 
load_mme.sh is the equivalent 14 | # python "$LOAD_MME_PY 15 | 16 | 17 | python -m llava.eval.model_vqa_loader \ 18 | --model-path "nahidalam/maya_full_ft" \ 19 | --model-base "CohereForAI/aya-23-8B" \ 20 | --question-file "./playground/data/eval/MME/llava_mme.jsonl" \ 21 | --image-folder "./playground/data/eval/MME/MME_Benchmark_release_version" \ 22 | --answers-file "./playground/data/eval/MME/answers/maya_full_ft.jsonl" \ 23 | --temperature "0" \ 24 | --conv-mode "aya" 25 | 26 | 27 | cd "$PROJECT_ROOT/playground/data/eval/MME" 28 | 29 | python "$CONVERT_ANSWER_TO_MME_PY" \ 30 | --experiment "maya_full_ft" 31 | 32 | 33 | cd "$PROJECT_ROOT/llava/eval/maya" 34 | 35 | python "$MME_CALCULATE_PY" \ 36 | --results_dir "../../../playground/data/eval/MME/eval_tool/answers/maya_full_ft" 37 | 38 | -------------------------------------------------------------------------------- /scripts/maya/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path nahidalam/maya_full_ft \ 5 | --model-base CohereForAI/aya-23-8B \ 6 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 7 | --image-folder ./playground/data/eval/mm-vet/images \ 8 | --answers-file ./playground/data/eval/mm-vet/answers/maya_full_ft.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode aya 11 | 12 | mkdir -p ./playground/data/eval/mm-vet/results_maya 13 | 14 | python scripts/convert_mmvet_for_eval.py \ 15 | --src ./playground/data/eval/mm-vet/answers/maya_full_ft.jsonl \ 16 | --dst ./playground/data/eval/mm-vet/results_maya/maya_full_ft.json 17 | 18 | -------------------------------------------------------------------------------- /scripts/maya/eval/palo_multilingual_llava_bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Evaluates Maya on the PALO Multilingual-LLaVA Bench-In-The-Wild dataset. 4 | # 5 | # Usage: 6 | # bash scripts/maya/eval/palo_multilingual_llava_bench.sh \ 7 | # [PROJECTOR_PATH] 8 | # 9 | # Note: 10 | # - MODE: Choose either 'pretrained' or 'finetuned' (without quotes). Example: finetuned 11 | # - PROJECTOR_PATH: required if MODE=pretrained 12 | # - See Readme for more details 13 | 14 | 15 | 16 | export MULTILINGUAL_LLAVA_BENCH_PATH="playground/data/eval/multilingual-llava-bench-in-the-wild" 17 | export OUTPUT_DIR="evaluation" 18 | export IMAGES="$MULTILINGUAL_LLAVA_BENCH_PATH/images" 19 | export PYTHONPATH="./:$PYTHONPATH" 20 | 21 | 22 | MODEL_BASE=$1 23 | MODEL_PATH=$2 24 | MODE=$3 25 | export OPENAI_API_KEY=$4 26 | PROJECTOR_PATH=$5 27 | 28 | 29 | evaluate_language() { 30 | local LANG=$1 31 | local QUESTIONS=$2 32 | local ANSWERS=$3 33 | local CONTEXT=$4 34 | local OUTPUT_FILE="Maya_${LANG}.jsonl" 35 | 36 | echo "******** Evaluating Maya on $LANG ********" 37 | 38 | cmd="python llava/eval/model_vqa_maya.py \ 39 | --model-base "$MODEL_BASE" \ 40 | --model-path "$MODEL_PATH" \ 41 | --mode $MODE \ 42 | --conv-mode aya \ 43 | --question-file "$QUESTIONS" \ 44 | --image-folder "$IMAGES" \ 45 | --answers-file "$OUTPUT_DIR/$OUTPUT_FILE" \ 46 | --temperature 0" 47 | 48 | # Add projector path if provided 49 | if [ ! 
-z "$PROJECTOR_PATH" ]; then 50 | cmd+=" --projector-path \"$PROJECTOR_PATH\"" 51 | fi 52 | 53 | # Execute the command 54 | eval $cmd 55 | 56 | mkdir -p "$OUTPUT_DIR/reviews" 57 | 58 | python llava/eval/eval_gpt_review_bench.py \ 59 | --question "$QUESTIONS" \ 60 | --context "$CONTEXT" \ 61 | --rule llava/eval/table/rule.json \ 62 | --answer-list \ 63 | "$ANSWERS" \ 64 | "$OUTPUT_DIR/$OUTPUT_FILE" \ 65 | --output \ 66 | "$OUTPUT_DIR/reviews/$OUTPUT_FILE" 67 | 68 | python llava/eval/summarize_gpt_review.py -f "$OUTPUT_DIR/reviews/$OUTPUT_FILE" 69 | 70 | } 71 | 72 | # Evaluate each language 73 | # 1.English 74 | evaluate_language "English" "$MULTILINGUAL_LLAVA_BENCH_PATH/english/questions.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/english/answers_gpt4.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/english/context.jsonl" 75 | 76 | # 2.Chinese 77 | evaluate_language "Chinese" "$MULTILINGUAL_LLAVA_BENCH_PATH/chinese/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/chinese/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/chinese/context.jsonl" 78 | 79 | # 3.Spanish 80 | evaluate_language "Spanish" "$MULTILINGUAL_LLAVA_BENCH_PATH/spanish/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/spanish/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/spanish/context_spanish.jsonl" 81 | 82 | # 4.French 83 | evaluate_language "French" "$MULTILINGUAL_LLAVA_BENCH_PATH/french/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/french/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/french/context.jsonl" 84 | 85 | # 5.Russian 86 | evaluate_language "Russian" "$MULTILINGUAL_LLAVA_BENCH_PATH/russian/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/russian/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/russian/context.jsonl" 87 | 88 | # 6.Arabic 89 | evaluate_language "Arabic" "$MULTILINGUAL_LLAVA_BENCH_PATH/arabic/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/arabic/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/arabic/context.jsonl" 90 | 91 | # 7.Bengali -- Not currently supported 92 | evaluate_language "Bengali" "$MULTILINGUAL_LLAVA_BENCH_PATH/bengali/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/bengali/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/bengali/context.jsonl" 93 | 94 | # 8.Hindi 95 | evaluate_language "Hindi" "$MULTILINGUAL_LLAVA_BENCH_PATH/hindi/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/hindi/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/hindi/context.jsonl" 96 | 97 | # 9.Urdu -- Not currently supported 98 | evaluate_language "Urdu" "$MULTILINGUAL_LLAVA_BENCH_PATH/urdu/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/urdu/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/urdu/context.jsonl" 99 | 100 | # 10.Japanese 101 | evaluate_language "Japanese" "$MULTILINGUAL_LLAVA_BENCH_PATH/japanese/question.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/japanese/answers.jsonl" "$MULTILINGUAL_LLAVA_BENCH_PATH/japanese/context.jsonl" 102 | -------------------------------------------------------------------------------- /scripts/maya/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m llava.eval.model_vqa_loader \ 3 | --model-path nahidalam/maya_full_ft \ 4 | --model-base CohereForAI/aya-23-8B \ 5 | --question-file ./playground/data/eval/pope/maya_pope_test.jsonl \ 6 | --image-folder ./playground/data/eval/pope/val2014 \ 7 | --answers-file ./playground/data/eval/pope/answers/maya_full_ft.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode aya 10 | 11 | python llava/eval/eval_pope.py \ 12 | --annotation-dir ./playground/data/eval/pope/coco \ 13 | --question-file 
./playground/data/eval/pope/maya_pope_test.jsonl \ 14 | --result-file ./playground/data/eval/pope/answers/maya_full_ft.jsonl 15 | -------------------------------------------------------------------------------- /scripts/maya/eval/process_prediction.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Load the Excel file 4 | file_path = './playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712/maya_full_ft.xlsx' 5 | excel_data = pd.read_excel(file_path) 6 | 7 | # Extract the 'prediction' and 'answer' columns 8 | predictions = excel_data['prediction'] 9 | answers = excel_data['answer'] 10 | 11 | # Calculate accuracy by comparing predictions to answers 12 | accuracy = (predictions == answers).mean() 13 | 14 | # Print the accuracy 15 | print(f"Accuracy: {accuracy * 100:.2f}%") 16 | 17 | -------------------------------------------------------------------------------- /scripts/maya/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_science \ 4 | --model-path nahidalam/maya_full_ft \ 5 | --model-base CohereForAI/aya-23-8B \ 6 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 7 | --image-folder ./playground/data/eval/scienceqa/images/test \ 8 | --answers-file ./playground/data/eval/scienceqa/answers/maya_full_ft.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | python llava/eval/eval_science_qa.py \ 14 | --base-dir ./playground/data/eval/scienceqa \ 15 | --result-file ./playground/data/eval/scienceqa/answers/maya_full_ft.jsonl \ 16 | --output-file ./playground/data/eval/scienceqa/answers/maya_full_ft_output.jsonl \ 17 | --output-result ./playground/data/eval/scienceqa/answers/maya_full_ft_result.json 18 | -------------------------------------------------------------------------------- /scripts/maya/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path nahidalam/maya_full_ft \ 5 | --model-base CohereForAI/aya-23-8B \ 6 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 7 | --image-folder ./playground/data/eval/textvqa/train_images \ 8 | --answers-file ./playground/data/eval/textvqa/answers/maya_full_ft.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | python -m llava.eval.eval_textvqa \ 13 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 14 | --result-file ./playground/data/eval/textvqa/answers/maya_full_ft.jsonl 15 | -------------------------------------------------------------------------------- /scripts/maya/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_vizwiz \ 4 | --model-path nahidalam/maya_full_ft \ 5 | --model-base CohereForAI/aya-23-8B \ 6 | --question-file ./playground/data/eval/vizwiz/test.json \ 7 | --image-folder ./playground/data/eval/vizwiz/test \ 8 | --answers-file ./playground/data/eval/vizwiz/answers/maya_test.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode aya 11 | 12 | python -m llava.eval.model_vqa_loader \ 13 | --model-path nahidalam/maya_full_ft \ 14 | --model-base CohereForAI/aya-23-8B \ 15 | --question-file ./playground/data/eval/vizwiz/llava_test_short.jsonl \ 16 | --image-folder ./playground/data/eval/vizwiz/test \ 17 | 
--answers-file ./playground/data/eval/vizwiz/answers/maya.jsonl \ 18 | --temperature 0 \ 19 | --conv-mode aya 20 | 21 | 22 | python scripts/convert_vizwiz_for_submission.py \ 23 | --annotation-file ./playground/data/eval/vizwiz/test.json \ 24 | --result-file ./playground/data/eval/vizwiz/answers/maya_test.jsonl \ 25 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/maya_full_ft.json 26 | -------------------------------------------------------------------------------- /scripts/maya/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="maya_full_ft" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path nahidalam/maya_full_ft \ 14 | --model-base CohereForAI/aya-23-8B \ 15 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/vqav2/test2015 \ 17 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode aya & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 37 | 38 | -------------------------------------------------------------------------------- /scripts/maya/finetune_aya_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path CohereForAI/aya-23-8B \ 6 | --version aya \ 7 | --data_path /dev/data/annotations/palo_multilingual_dataset.json \ 8 | --image_folder /dev/data/instruction_tune_dataset/ \ 9 | --vision_tower google/siglip-base-patch16-256-multilingual \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-aya-23-8b-siglip-multi-8lang-32bz-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/maya \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 4 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. 
\ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/maya/finetune_aya_siglip2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path CohereForAI/aya-23-8B \ 6 | --version aya \ 7 | --data_path /dev/data/annotations/palo_multilingual_dataset.json \ 8 | --image_folder /dev/data/instruction_tune_dataset/ \ 9 | --vision_tower google/siglip2-base-patch16-256 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-aya-23-8b-siglip2-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/maya \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 4 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/maya/pretrain_aya_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | MODEL_VERSION=aya-23-8b-siglip-multi 5 | 6 | ########### DO NOT CHANGE ########### 7 | ########### USE THIS FOR BOTH ########### 8 | PROMPT_VERSION=plain 9 | ########### DO NOT CHANGE ########### 10 | 11 | deepspeed llava/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path CohereForAI/aya-23-8B \ 14 | --version $PROMPT_VERSION \ 15 | --data_path /dev/data/LLaVA_Pretrain \ 16 | --image_folder /dev/data/images \ 17 | --vision_tower google/siglip-base-patch16-256-multilingual \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --bf16 True \ 24 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 4 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 8 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | -------------------------------------------------------------------------------- /scripts/maya/pretrain_aya_siglip2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | MODEL_VERSION=aya-23-8b-siglip2 5 | 6 | ########### DO NOT CHANGE ########### 7 | ########### USE THIS FOR BOTH ########### 8 | PROMPT_VERSION=plain 9 | ########### DO NOT CHANGE ########### 10 | 11 | deepspeed llava/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path CohereForAI/aya-23-8B \ 14 | --version $PROMPT_VERSION \ 15 | --data_path /dev/data/LLaVA_Pretrain \ 16 | --image_folder /dev/data/images \ 17 | --vision_tower google/siglip2-base-patch16-256 \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --bf16 True \ 24 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 4 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 8 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. \ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | -------------------------------------------------------------------------------- /scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5! 
4 | 5 | # Uncomment and set the following variables correspondingly to run this script: 6 | 7 | # MODEL_VERSION=vicuna-v1-3-7b 8 | # MODEL_VERSION=llama-2-7b-chat 9 | 10 | ########### DO NOT CHANGE ########### 11 | ########### USE THIS FOR BOTH ########### 12 | PROMPT_VERSION=plain 13 | ########### DO NOT CHANGE ########### 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 18 | --version $PROMPT_VERSION \ 19 | --data_path /path/to/pretrain_data.json \ 20 | --image_folder /path/to/images \ 21 | --vision_tower openai/clip-vit-large-patch14 \ 22 | --tune_mm_mlp_adapter True \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 28 | --num_train_epochs 1 \ 29 | --per_device_train_batch_size 16 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 1 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 24000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 2e-3 \ 37 | --weight_decay 0. \ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 4 \ 45 | --lazy_preprocess True \ 46 | --report_to wandb 47 | -------------------------------------------------------------------------------- /scripts/pretrain_xformers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | # MODEL_VERSION=vicuna-v1-3-7b 6 | # MODEL_VERSION=llama-2-7b-chat 7 | 8 | ########### DO NOT CHANGE ########### 9 | ########### USE THIS FOR BOTH ########### 10 | PROMPT_VERSION=plain 11 | ########### DO NOT CHANGE ########### 12 | 13 | deepspeed llava/train/train_xformers.py \ 14 | --deepspeed ./scripts/zero2.json \ 15 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 16 | --version $PROMPT_VERSION \ 17 | --data_path /path/to/pretrain_data.json \ 18 | --image_folder /path/to/images \ 19 | --vision_tower openai/clip-vit-large-patch14 \ 20 | --tune_mm_mlp_adapter True \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --bf16 False \ 25 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 4 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 4 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 2e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 False \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | -------------------------------------------------------------------------------- /scripts/sqa_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | for IDX in {0..7}; do 5 | CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ 6 | --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ 7 | --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ 8 | --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ 9 | --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ 10 | --num-chunks $CHUNKS \ 11 | --chunk-idx $IDX \ 12 | --conv-mode llava_v1 & 13 | done 14 | -------------------------------------------------------------------------------- /scripts/sqa_eval_gather.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | output_file="test_llava-13b.jsonl" 5 | 6 | # Clear out the output file if it exists. 7 | > "$output_file" 8 | 9 | # Loop through the indices and concatenate each file. 10 | for idx in $(seq 0 $((CHUNKS-1))); do 11 | cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" 12 | done 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ 16 | --result-file ./test_llava-13b.jsonl \ 17 | --output-file ./test_llava-13b_output.json \ 18 | --output-result ./test_llava-13b_result.json 19 | -------------------------------------------------------------------------------- /scripts/upload_pypi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Step 0: Clean up 4 | rm -rf dist 5 | 6 | # Step 1: Change the package name to "llava-torch" 7 | sed -i 's/name = "llava"/name = "llava-torch"/' pyproject.toml 8 | 9 | # Step 2: Build the package 10 | python -m build 11 | 12 | # Step 3: Revert the changes in pyproject.toml to the original 13 | sed -i 's/name = "llava-torch"/name = "llava"/' pyproject.toml 14 | 15 | # Step 4: Upload to PyPI 16 | python -m twine upload dist/* 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/eval_all_languages_deprecated.sh: -------------------------------------------------------------------------------- 1 | # This script for PALO Multilingual LLaVA Bench In-the-Wild Benchmark is deprecated. 
2 | # Please use the new script at LLaVA/scripts/maya/eval/palo_multilingual_llava_bench.sh 3 | 4 | 5 | 6 | #!/bin/bash 7 | 8 | export MULTILINGUAL_LLAVA_BENCH_PATH="playground/data/eval/multilingual-llava-bench-in-the-wild" 9 | export OUTPUT_DIR="evaluation" 10 | export IMAGES="$MULTILINGUAL_LLAVA_BENCH_PATH/images" 11 | 12 | #export MODEL="/path/to/palo-v1.5-7b-665en_150K_of_arr_chi_hin_spa_ben_fr_jap_rus_ur" 13 | #export MODEL_NAME="palo-v1.5-7b-665en_150K_of_arr_chi_hin_spa_ben_fr_jap_rus_ur" 14 | #export OPENAI_API_KEY="write your open-ai key" 15 | 16 | MODEL_BASE=$1 17 | MODEL_PATH=$2 18 | PROJECTOR_PATH=$3 19 | MODEL_NAME=$4 20 | export OPENAI_API_KEY=$5 21 | 22 | export PYTHONPATH="./:$PYTHONPATH" 23 | 24 | # 1.English 25 | bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/english/questions.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/english/answers_gpt4.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/english/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_English.jsonl 26 | 27 | # 2.Chinese 28 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/chinese/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/chinese/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/chinese/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Chinese.jsonl 29 | 30 | # 3.Spanish 31 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/spanish/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/spanish/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/spanish/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Spanish.jsonl 32 | 33 | # 4.French 34 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/french/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/french/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/french/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_French.jsonl 35 | 36 | # 6.Russian 37 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/russian/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/russian/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/russian/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Russian.jsonl 38 | 39 | # 7.Arabic 40 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/arabic/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/arabic/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/arabic/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Arabic.jsonl 41 | 42 | # 8.Bengali 43 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/bengali/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/bengali/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/bengali/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Bengali.jsonl 44 | 45 | # 9.Hindi 46 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/hindi/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/hindi/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/hindi/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Hindi.jsonl 47 | 48 | # 10.Urdu 49 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" 
"$MULTILINGUAL_LLAVA_BENCH_PATH"/urdu/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/urdu/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/urdu/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Urdu.jsonl 50 | 51 | # 11.Japanese 52 | # bash scripts/v1_5/eval/llavabench_palo.sh "$IMAGES" "$MODEL_BASE" "$MODEL_PATH" "$PROJECTOR_PATH" "$MULTILINGUAL_LLAVA_BENCH_PATH"/japanese/question.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/japanese/answers.jsonl "$MULTILINGUAL_LLAVA_BENCH_PATH"/japanese/context.jsonl "$OUTPUT_DIR" "$MODEL_NAME"_Japanese.jsonl 53 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_gqa_testdev_balanced" 10 | GQADIR="./playground/data/eval/gqa/data" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path liuhaotian/llava-v1.5-13b \ 15 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/gqa/data/images \ 17 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 37 | 38 | cd $GQADIR 39 | python eval/eval.py --tier testdev_balanced 40 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 6 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 7 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 12 | 13 | python llava/eval/eval_gpt_review_bench.py \ 14 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 15 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 16 | --rule llava/eval/table/rule.json \ 17 | --answer-list \ 18 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 19 | playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 20 | --output \ 21 | playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 22 | 23 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 24 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/llavabench_palo_deprecated.sh: 
-------------------------------------------------------------------------------- 1 | # This script for PALO Multilingual LLaVA Bench In-the-Wild Benchmark is deprecated. 2 | # Please use the new script at LLaVA/scripts/maya/eval/palo_multilingual_llava_bench.sh 3 | 4 | 5 | 6 | #!/bin/bash 7 | 8 | IMAGES=$1 9 | MODEL_BASE=$2 10 | MODEL_PATH=$3 11 | PROJECTOR_PATH=$4 12 | QUESTIONS=$5 13 | ANSWERS=$6 14 | CONTEXT=$7 15 | OUTPUT_DIR=$8 16 | OUTPUT_FILE=$9 17 | 18 | 19 | python llava/eval/model_vqa_maya.py \ 20 | --model-base "$MODEL_BASE" \ 21 | --model-path "$MODEL_PATH" \ 22 | --projector-path "$PROJECTOR_PATH" \ 23 | --question-file "$QUESTIONS" \ 24 | --image-folder "$IMAGES" \ 25 | --answers-file "$OUTPUT_DIR/$OUTPUT_FILE" \ 26 | --temperature 0 \ 27 | --conv-mode aya 28 | 29 | mkdir -p "$OUTPUT_DIR/reviews" 30 | 31 | python llava/eval/eval_gpt_review_bench.py \ 32 | --question "$QUESTIONS" \ 33 | --context "$CONTEXT" \ 34 | --rule llava/eval/table/rule.json \ 35 | --answer-list \ 36 | "$ANSWERS" \ 37 | "$OUTPUT_DIR/$OUTPUT_FILE" \ 38 | --output \ 39 | "$OUTPUT_DIR/reviews/$OUTPUT_FILE" 40 | 41 | python llava/eval/summarize_gpt_review.py -f "$OUTPUT_DIR/reviews/$OUTPUT_FILE" 42 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_20230712" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --single-pred-prompt \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 14 | 15 | python scripts/convert_mmbench_for_submission.py \ 16 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 17 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \ 18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 19 | --experiment llava-v1.5-13b 20 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPLIT="mmbench_dev_cn_20231003" 4 | 5 | python -m llava.eval.model_vqa_mmbench \ 6 | --model-path liuhaotian/llava-v1.5-13b \ 7 | --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 8 | --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | --lang cn \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p playground/data/eval/mmbench_cn/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 18 | --result-dir ./playground/data/eval/mmbench_cn/answers/$SPLIT \ 19 | --upload-dir ./playground/data/eval/mmbench_cn/answers_upload/$SPLIT \ 20 | --experiment llava-v1.5-13b 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \ 6 | --image-folder
./playground/data/eval/MME/MME_Benchmark_release_version \ 7 | --answers-file ./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | cd ./playground/data/eval/MME 12 | 13 | python convert_answer_to_mme.py --experiment llava-v1.5-13b 14 | 15 | cd eval_tool 16 | 17 | python calculation.py --results_dir answers/llava-v1.5-13b 18 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 6 | --image-folder ./playground/data/eval/mm-vet/images \ 7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p ./playground/data/eval/mm-vet/results 12 | 13 | python scripts/convert_mmvet_for_eval.py \ 14 | --src ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 15 | --dst ./playground/data/eval/mm-vet/results/llava-v1.5-13b.json 16 | 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 6 | --image-folder ./playground/data/eval/pope/val2014 \ 7 | --answers-file ./playground/data/eval/pope/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python llava/eval/eval_pope.py \ 12 | --annotation-dir ./playground/data/eval/pope/coco \ 13 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 14 | --result-file ./playground/data/eval/pope/answers/llava-v1.5-13b.jsonl 15 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | echo "Evaluating in 'dev' split." 5 | elif [ "$1" = "test" ]; then 6 | echo "Evaluating in 'test' split." 7 | else 8 | echo "Unknown split, please choose between 'dev' and 'test'." 9 | exit 1 10 | fi 11 | 12 | python -m llava.eval.model_vqa_qbench \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 15 | --questions-file ./playground/data/eval/qbench/llvisionqa_$1.json \ 16 | --answers-file ./playground/data/eval/qbench/llvisionqa_$1_answers.jsonl \ 17 | --conv-mode llava_v1 \ 18 | --lang en 19 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench_zh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | ZH_SPLIT="验证集" 5 | echo "Evaluating in 'dev' split." 6 | elif [ "$1" = "test" ]; then 7 | ZH_SPLIT="测试集" 8 | echo "Evaluating in 'test' split." 9 | else 10 | echo "Unknown split, please choose between 'dev' and 'test'." 
11 | exit 1 12 | fi 13 | 14 | python -m llava.eval.model_vqa_qbench \ 15 | --model-path liuhaotian/llava-v1.5-13b \ 16 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 17 | --questions-file ./playground/data/eval/qbench/质衡-问答-$ZH_SPLIT.json \ 18 | --answers-file ./playground/data/eval/qbench/llvisionqa_zh_$1_answers.jsonl \ 19 | --conv-mode llava_v1 \ 20 | --lang zh 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 12 | --model-path liuhaotian/llava-v1.5-13b \ 13 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench.jsonl \ 14 | --image-folder ./playground/data/eval/seed_bench \ 15 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode vicuna_v1 & 20 | done 21 | 22 | wait 23 | 24 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl 25 | 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | # Evaluate 35 | python scripts/convert_seed_for_submission.py \ 36 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 37 | --result-file $output_file \ 38 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/llava-v1.5-13b.jsonl 39 | 40 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_science \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \ 6 | --image-folder ./playground/data/eval/scienceqa/images/test \ 7 | --answers-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 8 | --single-pred-prompt \ 9 | --temperature 0 \ 10 | --conv-mode vicuna_v1 11 | 12 | python llava/eval/eval_science_qa.py \ 13 | --base-dir ./playground/data/eval/scienceqa \ 14 | --result-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \ 15 | --output-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_output.jsonl \ 16 | --output-result ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_result.json 17 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 6 | --image-folder ./playground/data/eval/textvqa/train_images \ 7 | --answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python -m llava.eval.eval_textvqa \ 12 | 
--annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 13 | --result-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl 14 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa_loader \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 6 | --image-folder ./playground/data/eval/vizwiz/test \ 7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | python scripts/convert_vizwiz_for_submission.py \ 12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 13 | --result-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-v1.5-13b.json 15 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="llava-v1.5-13b" 9 | SPLIT="llava_vqav2_mscoco_test-dev2015" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 15 | --image-folder ./playground/data/eval/vqav2/test2015 \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 
31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 36 | 37 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-13b \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 16 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 1 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_aya_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path CohereForAI/aya-23-8B \ 6 | --version aya \ 7 | --data_path /dev/data/annotations/palo_multilingual_dataset.json \ 8 | --image_folder /dev/data/instruction_tune_dataset/ \ 9 | --vision_tower google/siglip-base-patch16-256-multilingual \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-aya-23-8b-siglip-multi-8lang-32bz-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/maya \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 4 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 4 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. 
\ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_llava_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-13b \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 16 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 1 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_llava_siglip2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-13b \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 16 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 1 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-5 \ 28 | --weight_decay 0. 
\ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 5 | --deepspeed ./scripts/zero3.json \ 6 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 7 | --version v1 \ 8 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 9 | --image_folder ./playground/data \ 10 | --vision_tower openai/clip-vit-large-patch14-336 \ 11 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ 12 | --mm_projector_type mlp2x_gelu \ 13 | --mm_vision_select_layer -2 \ 14 | --mm_use_im_start_end False \ 15 | --mm_use_im_patch_token False \ 16 | --image_aspect_ratio pad \ 17 | --group_by_modality_length True \ 18 | --bf16 True \ 19 | --output_dir ./checkpoints/llava-v1.5-13b-lora \ 20 | --num_train_epochs 1 \ 21 | --per_device_train_batch_size 16 \ 22 | --per_device_eval_batch_size 4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --evaluation_strategy "no" \ 25 | --save_strategy "steps" \ 26 | --save_steps 50000 \ 27 | --save_total_limit 1 \ 28 | --learning_rate 2e-4 \ 29 | --weight_decay 0. \ 30 | --warmup_ratio 0.03 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --tf32 True \ 34 | --model_max_length 2048 \ 35 | --gradient_checkpointing True \ 36 | --dataloader_num_workers 4 \ 37 | --lazy_preprocess True \ 38 | --report_to wandb 39 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero3.json \ 5 | --model_name_or_path liuhaotian/llava-v1.5-13b \ 6 | --version v1 \ 7 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 8 | --image_folder ./playground/data \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --mm_vision_select_layer -2 \ 12 | --mm_use_im_start_end False \ 13 | --mm_use_im_patch_token False \ 14 | --image_aspect_ratio pad \ 15 | --group_by_modality_length True \ 16 | --bf16 True \ 17 | --output_dir ./checkpoints/llava-v1.5-13b-task \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 16 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 1 \ 22 | --evaluation_strategy "no" \ 23 | --save_strategy "steps" \ 24 | --save_steps 50000 \ 25 | --save_total_limit 1 \ 26 | --learning_rate 2e-5 \ 27 | --weight_decay 0. 
\ 28 | --warmup_ratio 0.03 \ 29 | --lr_scheduler_type "cosine" \ 30 | --logging_steps 1 \ 31 | --tf32 True \ 32 | --model_max_length 2048 \ 33 | --gradient_checkpointing True \ 34 | --dataloader_num_workers 4 \ 35 | --lazy_preprocess True \ 36 | --report_to wandb 37 | -------------------------------------------------------------------------------- /scripts/v1_5/finetune_task_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ 5 | --deepspeed ./scripts/zero3.json \ 6 | --model_name_or_path liuhaotian/llava-v1.5-13b \ 7 | --version v1 \ 8 | --data_path ./playground/data/llava_v1_5_mix665k.json \ 9 | --image_folder ./playground/data \ 10 | --vision_tower openai/clip-vit-large-patch14-336 \ 11 | --mm_projector_type mlp2x_gelu \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --image_aspect_ratio pad \ 16 | --group_by_modality_length True \ 17 | --bf16 True \ 18 | --output_dir ./checkpoints/llava-v1.5-13b-task-lora \ 19 | --num_train_epochs 1 \ 20 | --per_device_train_batch_size 16 \ 21 | --per_device_eval_batch_size 4 \ 22 | --gradient_accumulation_steps 1 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 50000 \ 26 | --save_total_limit 1 \ 27 | --learning_rate 2e-4 \ 28 | --weight_decay 0. \ 29 | --warmup_ratio 0.03 \ 30 | --lr_scheduler_type "cosine" \ 31 | --logging_steps 1 \ 32 | --tf32 True \ 33 | --model_max_length 2048 \ 34 | --gradient_checkpointing True \ 35 | --dataloader_num_workers 4 \ 36 | --lazy_preprocess True \ 37 | --report_to wandb 38 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \ 6 | --version plain \ 7 | --data_path ./playground/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./playground/data/LLaVA-Pretrain/images \ 9 | --vision_tower openai/clip-vit-large-patch14-336 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-13b-pretrain \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. 
\ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain_aya.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | MODEL_VERSION=aya-23-8b 5 | 6 | ########### DO NOT CHANGE ########### 7 | ########### USE THIS FOR BOTH ########### 8 | PROMPT_VERSION=plain 9 | ########### DO NOT CHANGE ########### 10 | 11 | deepspeed llava/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path CohereForAI/aya-23-8B \ 14 | --version $PROMPT_VERSION \ 15 | --data_path /home/ubuntu/LLaVA/playground/data/LLaVA-Pretrain/ \ 16 | --image_folder /home/ubuntu/LLaVA/playground/data/LLaVA-Pretrain/images \ 17 | --vision_tower openai/clip-vit-large-patch14-336 \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --bf16 True \ 24 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 8 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 1 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. \ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain_aya_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | MODEL_VERSION=aya-23-8b-siglip-multi 5 | 6 | ########### DO NOT CHANGE ########### 7 | ########### USE THIS FOR BOTH ########### 8 | PROMPT_VERSION=plain 9 | ########### DO NOT CHANGE ########### 10 | 11 | deepspeed llava/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path CohereForAI/aya-23-8B \ 14 | --version $PROMPT_VERSION \ 15 | --data_path /dev/data/LLaVA_Pretrain \ 16 | --image_folder /dev/data/images \ 17 | --vision_tower google/siglip-base-patch16-256-multilingual \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --bf16 True \ 24 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 4 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 8 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain_llava_siglip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-7b-v1.5 \ 6 | --version plain \ 7 | --data_path ./dev/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./dev/data/images \ 9 | --vision_tower google/siglip-base-patch16-256 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-7b-pretrain-siglip-base-patch16-256 \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. \ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /scripts/v1_5/pretrain_llava_siglip2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-7b-v1.5 \ 6 | --version plain \ 7 | --data_path ./dev/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \ 8 | --image_folder ./dev/data/images \ 9 | --vision_tower google/siglip2-base-patch16-256 \ 10 | --mm_projector_type mlp2x_gelu \ 11 | --tune_mm_mlp_adapter True \ 12 | --mm_vision_select_layer -2 \ 13 | --mm_use_im_start_end False \ 14 | --mm_use_im_patch_token False \ 15 | --bf16 True \ 16 | --output_dir ./checkpoints/llava-v1.5-7b-pretrain-siglip2-base-patch-16 \ 17 | --num_train_epochs 1 \ 18 | --per_device_train_batch_size 32 \ 19 | --per_device_eval_batch_size 4 \ 20 | --gradient_accumulation_steps 1 \ 21 | --evaluation_strategy "no" \ 22 | --save_strategy "steps" \ 23 | --save_steps 24000 \ 24 | --save_total_limit 1 \ 25 | --learning_rate 1e-3 \ 26 | --weight_decay 0. 
\ 27 | --warmup_ratio 0.03 \ 28 | --lr_scheduler_type "cosine" \ 29 | --logging_steps 1 \ 30 | --tf32 True \ 31 | --model_max_length 2048 \ 32 | --gradient_checkpointing True \ 33 | --dataloader_num_workers 4 \ 34 | --lazy_preprocess True \ 35 | --report_to wandb 36 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } --------------------------------------------------------------------------------
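Note on the ZeRO configs above (scripts/zero2.json, scripts/zero3.json, scripts/zero3_offload.json): every training script in this repo hands one of them to DeepSpeed through the --deepspeed flag, and the HuggingFace Trainer's DeepSpeed integration resolves each "auto" field from the matching command-line flag (--bf16, --per_device_train_batch_size, --gradient_accumulation_steps, --learning_rate, and so on), so only settings such as the ZeRO stage and the offload targets are pinned in the JSON itself. Below is a minimal illustrative sketch, not part of the repository (the file name inspect_zero_config.py is hypothetical), that lists which settings a given config hard-codes and which are left for the Trainer to fill in:

# inspect_zero_config.py -- illustrative sketch only, not part of the repository.
# Prints which DeepSpeed settings a ZeRO config pins down and which are left
# as "auto" for the HuggingFace Trainer to fill in from its TrainingArguments.
import json
import sys

def walk(node, prefix=""):
    for key, value in node.items():
        path = prefix + key
        if isinstance(value, dict):
            walk(value, path + ".")
        elif value == "auto":
            print(f"auto  : {path}  (taken from the shell flags at launch time)")
        else:
            print(f"fixed : {path} = {value}")

if __name__ == "__main__":
    # Example: python inspect_zero_config.py scripts/zero3_offload.json
    config_path = sys.argv[1] if len(sys.argv) > 1 else "scripts/zero3.json"
    with open(config_path) as f:
        walk(json.load(f))

Run against scripts/zero3_offload.json, for example, the fixed entries are mostly the ZeRO stage, the CPU offload targets for optimizer states and parameters, and the communication/bucketing settings, while batch sizes, precision mode, learning rate, and the optimizer/scheduler parameters remain "auto" and therefore follow whatever the finetune_*.sh or pretrain_*.sh script passes on the command line.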