├── LICENSE ├── README.md ├── cog.yaml ├── hostfile ├── hostfile_4 ├── images ├── demo_gen.png ├── demo_und.png ├── pipeline.png └── teaser.png ├── mgm ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── MMMU │ │ ├── LICENSE │ │ ├── README.md │ │ ├── eval │ │ │ ├── README.md │ │ │ ├── answer_dict_val.json │ │ │ ├── configs │ │ │ │ └── llava1.5.yaml │ │ │ ├── convert_to_test.py │ │ │ ├── eval.py │ │ │ ├── example_outputs │ │ │ │ ├── llava1.5_13b │ │ │ │ │ ├── Accounting │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Agriculture │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Architecture_and_Engineering │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Art │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Art_Theory │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Basic_Medical_Science │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Biology │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Chemistry │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Clinical_Medicine │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Computer_Science │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Design │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Diagnostics_and_Laboratory_Medicine │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Economics │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Electronics │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Energy_and_Power │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Finance │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Geography │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── History │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Literature │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Manage │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Marketing │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Materials │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Math │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Mechanical_Engineering │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Music │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Pharmacy │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Physics │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Psychology │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Public_Health │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Sociology │ │ │ │ │ │ └── output.json │ │ │ │ │ └── total_val_output.json │ │ │ │ ├── llava1.5_13b_val.json │ │ │ │ └── qwen_vl │ │ │ │ │ ├── Accounting │ │ │ │ │ └── output.json │ │ │ │ │ ├── Agriculture │ │ │ │ │ └── output.json │ │ │ │ │ ├── Architecture_and_Engineering │ │ │ │ │ └── output.json │ │ │ │ │ ├── Art │ │ │ │ │ └── output.json │ │ │ │ │ ├── Art_Theory │ │ │ │ │ └── output.json │ │ │ │ │ ├── Basic_Medical_Science │ │ │ │ │ └── output.json │ │ │ │ │ ├── Biology │ │ │ │ │ └── output.json │ │ │ │ │ ├── Chemistry │ │ │ │ │ └── output.json │ │ │ │ │ ├── Clinical_Medicine │ │ │ │ │ └── output.json │ │ │ │ │ ├── Computer_Science │ │ │ │ │ └── output.json │ │ │ │ │ ├── Design │ │ │ │ │ └── output.json │ │ │ │ │ ├── Diagnostics_and_Laboratory_Medicine │ │ │ │ │ └── output.json │ │ │ │ │ ├── Economics │ │ │ │ │ └── output.json │ │ │ │ │ ├── Electronics │ │ │ │ │ └── output.json │ │ │ │ │ ├── Energy_and_Power │ │ │ │ │ └── output.json │ │ │ │ │ ├── Finance │ │ │ │ │ └── output.json │ │ │ │ │ ├── Geography │ │ │ │ │ └── output.json │ │ │ │ │ ├── History │ │ │ │ │ └── output.json │ │ │ │ │ ├── Literature │ │ │ │ │ └── output.json │ │ │ │ │ ├── Manage │ │ │ │ │ └── output.json │ │ │ │ │ ├── Marketing │ │ │ │ │ └── output.json │ │ │ │ │ ├── Materials │ │ │ │ │ └── output.json │ │ │ │ │ ├── Math │ │ │ │ │ └── output.json │ │ │ │ │ ├── Mechanical_Engineering │ │ │ │ │ └── output.json │ │ │ │ │ ├── Music │ │ │ │ │ └── output.json │ │ │ │ │ ├── Pharmacy │ │ │ │ │ └── 
output.json │ │ │ │ │ ├── Physics │ │ │ │ │ └── output.json │ │ │ │ │ ├── Psychology │ │ │ │ │ └── output.json │ │ │ │ │ ├── Public_Health │ │ │ │ │ └── output.json │ │ │ │ │ ├── Sociology │ │ │ │ │ └── output.json │ │ │ │ │ └── total_val_output.json │ │ │ ├── main_eval_only.py │ │ │ ├── main_parse_and_eval.py │ │ │ ├── print_results.py │ │ │ ├── run_llava.py │ │ │ └── utils │ │ │ │ ├── data_utils.py │ │ │ │ ├── eval_utils.py │ │ │ │ └── model_utils.py │ │ └── image.png │ ├── MathVista │ │ ├── calculate_score.py │ │ ├── extract_answer.py │ │ ├── prompts │ │ │ └── ext_ans.py │ │ └── utilities.py │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── model_math_vista.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_qbench.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── mgm_gemma.py │ │ ├── mgm_llama.py │ │ ├── mgm_mistral.py │ │ └── mgm_mixtral.py │ ├── llava_arch.py │ ├── mgm_arch.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ ├── eva_encoder.py │ │ └── openclip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── processor │ │ └── video_processor.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ ├── monday.jpg │ │ ├── waterview.jpg │ │ └── woolen.png │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── predict.py ├── pyproject.toml └── scripts ├── convert_gqa_for_eval.py ├── convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_seed_for_submission.py ├── extract_mm_projector.py ├── gemma ├── eval │ ├── math_vista.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu.sh │ ├── mmmu_test.sh │ ├── mmvet.sh │ └── textvqa.sh └── train │ ├── stage_1_2_full_gemma_v2b_336_hr_768.sh │ └── stage_2_full_gemma_v2b_672_hr_1536.sh ├── llama ├── eval │ ├── math_vista.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu.sh │ ├── mmmu_test.sh │ ├── mmvet.sh │ └── textvqa.sh └── train │ ├── stage_1_2_full_v13b_336_hr_768.sh │ ├── stage_1_2_full_v7b_336_hr_768.sh │ ├── stage_1_2_full_v7b_336_hr_768_nodp.sh │ ├── stage_2_full_v13b_672_hr_1536.sh │ └── stage_2_full_v7b_672_hr_1536.sh ├── llama3 ├── eval │ ├── math_vista.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu.sh │ ├── mmmu_test.sh │ ├── mmvet.sh │ └── textvqa.sh └── train │ ├── stage_1_2_full_v8b_336_hr_768.sh │ └── stage_2_full_v8b_672_hr_1536.sh ├── merge_lora_weights.py ├── mixtral ├── eval │ ├── math_vista.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu.sh │ ├── mmmu_test.sh │ ├── mmvet.sh │ └── textvqa.sh └── train │ ├── stage_1_2_full_mixtral_8x7b_336_hr_768.sh │ └── stage_2_full_mixtral_8x7b_672_hr_1536.sh ├── yi ├── eval │ 
├── math_vista.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu.sh │ ├── mmmu_test.sh │ ├── mmvet.sh │ └── textvqa.sh └── train │ ├── stage_1_2_full_yi34b_336_hr_768.sh │ └── stage_2_full_yi34b_672_hr_1536.sh ├── zero2.json ├── zero2_offload.json └── zero3.json /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - "bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /hostfile: -------------------------------------------------------------------------------- 1 | your_ip_0 slots=8 2 | your_ip_1 slots=8 3 | -------------------------------------------------------------------------------- /hostfile_4: -------------------------------------------------------------------------------- 1 | your_ip_0 slots=8 2 | your_ip_1 slots=8 3 | your_ip_2 slots=8 4 | your_ip_3 slots=8 -------------------------------------------------------------------------------- /images/demo_gen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/images/demo_gen.png -------------------------------------------------------------------------------- /images/demo_und.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/images/demo_und.png -------------------------------------------------------------------------------- /images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/images/pipeline.png -------------------------------------------------------------------------------- /images/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/images/teaser.png -------------------------------------------------------------------------------- /mgm/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import MGMLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /mgm/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 
15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | PREDICT_TOKEN_INDEX = -300 10 | DEFAULT_IMAGE_TOKEN = "" 11 | DEFAULT_IMAGE_PATCH_TOKEN = "" 12 | DEFAULT_IM_START_TOKEN = "" 13 | DEFAULT_IM_END_TOKEN = "" 14 | IMAGE_PLACEHOLDER = "" 15 | DEFAULT_PREDICT_TOKEN = "" 16 | 17 | DESCRIPT_PROMPT = [ 18 | "Describe this image thoroughly.", 19 | "Provide a detailed description in this picture.", 20 | "Detail every aspect of what's in this picture.", 21 | "Explain this image with precision and detail.", 22 | "Give a comprehensive description of this visual.", 23 | "Elaborate on the specifics within this image.", 24 | "Offer a detailed account of this picture's contents.", 25 | "Describe in detail what this image portrays.", 26 | "Break down this image into detailed descriptions.", 27 | "Provide a thorough description of the elements in this image."] -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation Guidelines 2 | We provide detailed instructions for evaluation. 3 | To execute our evaluation script, please ensure that the structure of your model outputs is the same as ours. 4 | 5 | We provide two options: 6 | 1. Evaluation only: you can parse the response on your own and simply provide one file with all the final predictions. 7 | 2. Parse and evaluation: you can leave all the responses to us with the output formats shown below. 8 | 9 | ## Evaluation Only 10 | If you want to use your own parsing logic and *only provide the final answer*, you can use `main_eval_only.py`. 11 | 12 | You can provide all the outputs in *one file* in the following format: 13 | 14 | ``` 15 | { 16 | "validation_Accounting_1": "D", # strictly "A", "B", "C", "D" for multi-choice question 17 | "validation_Architecture_and_Engineering_14": "0.0", # any string response for open question. 18 | ... 19 | } 20 | ``` 21 | Then run eval_only with: 22 | ``` 23 | python main_eval_only.py --output_path ./example_outputs/llava1.5_13b/total_val_output.json 24 | ``` 25 | 26 | Please refer to [example output](https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/example_outputs/llava1.5_13b/total_val_output.json) for a detailed prediction file form. 27 | 28 | 29 | ## Parse and Evaluation 30 | You can also provide response and run the `main_parse_and_eval.py` to use our answer parsing processing and evaluation pipeline as follows: 31 | 32 | ### Output folder structure 33 | 34 | ``` 35 | └── model_name 36 | ├── category_name (e.g., Accounting) 37 | │ ├── output.json 38 | └── category_name (e.g., Electronics) 39 | ├── output.json 40 | ... 41 | ``` 42 | 43 | ### Output file 44 | Each `output.json`` has a list of dict containing instances for evaluation (). 
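As a purely illustrative aside (this snippet is hypothetical and not part of the evaluation code), one way to assemble and write such a per-category file from raw model predictions is sketched here; the field names follow the format example given right after this sketch:

```
import json
import os

# Hypothetical predictions gathered from a model run
# (field names as in the format example shown below).
predictions = [
    {
        "id": "validation_Electronics_28",
        "question_type": "multiple-choice",
        "answer": "A",
        "all_choices": ["A", "B", "C", "D"],
        "index2ans": {"A": "...", "B": "...", "C": "...", "D": "..."},
        "response": "B",
    },
]

# "my_model" is a placeholder for your model name.
out_dir = os.path.join("example_outputs", "my_model", "Electronics")
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, "output.json"), "w") as f:
    json.dump(predictions, f, indent=2)
```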
45 | ``` 46 | [ 47 | { 48 | "id": "validation_Electronics_28", 49 | "question_type": "multiple-choice", 50 | "answer": "A", # given answer 51 | "all_choices": [ # create using `get_multi_choice_info` in 52 | "A", 53 | "B", 54 | "C", 55 | "D" 56 | ], 57 | "index2ans": { # create using `get_multi_choice_info` in 58 | "A": "75 + 13.3 cos(250t - 57.7°)V", 59 | "B": "75 + 23.3 cos(250t - 57.7°)V", 60 | "C": "45 + 3.3 cos(250t - 57.7°)V", 61 | "D": "95 + 13.3 cos(250t - 57.7°)V" 62 | }, 63 | "response": "B" # model response 64 | }, 65 | { 66 | "id": "validation_Electronics_29", 67 | "question_type": "short-answer", 68 | "answer": "30", # given answer 69 | "response": "36 watts" # model response 70 | }, 71 | ... 72 | ] 73 | ``` 74 | 75 | ### Evaluation 76 | ``` 77 | python main_parse_and_eval.py --path ./example_outputs/llava1.5_13b --subject ALL # all subject 78 | 79 | # OR you can sepecify one subject for the evaluation 80 | 81 | python main_parse_and_eval.py --path ./example_outputs/llava1.5_13b --subject elec # short name for Electronics. use --help for all short names 82 | 83 | ``` 84 | 85 | `main_parse_and_eval.py` will generate `parsed_output.json` and `result.json` in the subfolder under the same category with output.json, respectively. 86 | 87 | ``` 88 | ├── Accounting 89 | │ ├── output.json 90 | │ ├── parsed_output.json 91 | │ └── result.json 92 | └── Electronics 93 | ├── output.json 94 | ├── parsed_output.json 95 | └── result.json 96 | ... 97 | ``` 98 | 99 | ### Print Results 100 | You can print results locally if you want. (use `pip install tabulate` if you haven't) 101 | ``` 102 | python print_results.py --path ./example_outputs/llava1.5_13b 103 | # Results may be slightly different due to the ramdon selection for fail response 104 | ``` 105 | 106 | 107 | 108 | ##### Run Llava 109 | In case if you want to reproduce the results of some of the models, please go check run_llava.py as an example. 110 | 111 | By seeting up the env following the [llava official repo](https://github.com/haotian-liu/LLaVA) and installing `datasets` packages by huggingface, you can run llava viathe following command: 112 | 113 | ``` 114 | CUDA_VISIBLE_DEVICES=0 nohup python run_llava.py \ 115 | --output_path example_outputs/llava1.5_13b_val.json \ 116 | --model_path liuhaotian/llava-v1.5-13b \ 117 | --config_path configs/llava1.5.yaml 118 | ``` 119 | 120 | Then you can evaluate the results via the very first pipeline. 121 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/configs/llava1.5.yaml: -------------------------------------------------------------------------------- 1 | task_instructions: 2 | - "" 3 | multi_choice_example_format: 4 | - "{} 5 | 6 | {} 7 | 8 | Answer with the option's letter from the given choices directly." 9 | 10 | short_ans_example_format: 11 | - "{} 12 | 13 | Answer the question using a single word or phrase." 
14 | temperature: 15 | - 0 -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/convert_to_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from argparse import ArgumentParser 4 | 5 | from utils.eval_utils import evaluate 6 | from utils.data_utils import save_json 7 | 8 | 9 | def main(): 10 | parser = ArgumentParser() 11 | parser.add_argument('--result_file', type=str, default='llava1.5_13b_val.txt', 12 | help='name of saved json') 13 | parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json', 14 | help='name of saved json') 15 | 16 | args = parser.parse_args() 17 | out_samples = [json.loads(line) for line in open(args.result_file)] 18 | out_json = {} 19 | for _sample in out_samples: 20 | _result = _sample['parsed_pred'] 21 | if isinstance(_result, list): 22 | _result = str(_result[0]) 23 | out_json[_sample['id']] = _result 24 | 25 | save_json(args.output_path, out_json) 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from argparse import ArgumentParser 4 | 5 | from utils.eval_utils import evaluate 6 | from utils.data_utils import save_json 7 | 8 | 9 | def main(): 10 | parser = ArgumentParser() 11 | parser.add_argument('--result_file', type=str, default='llava1.5_13b_val.txt', 12 | help='name of saved json') 13 | parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json', 14 | help='name of saved json') 15 | 16 | args = parser.parse_args() 17 | out_samples = [json.loads(line) for line in open(args.result_file)] 18 | 19 | judge_dict, metric_dict = evaluate(out_samples) 20 | metric_dict.update({"num_example": len(out_samples)}) 21 | judge_dict['metric_dict'] = metric_dict 22 | save_dir = '/'.join(args.output_path.split('/')[:-1]) 23 | if not os.path.exists(save_dir): 24 | os.makedirs(save_dir) 25 | save_json(args.output_path, judge_dict) 26 | 27 | print(metric_dict) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/main_eval_only.py: -------------------------------------------------------------------------------- 1 | """Parse and Evalate""" 2 | import os 3 | import json 4 | 5 | import pdb 6 | from argparse import ArgumentParser 7 | 8 | from utils.data_utils import save_json, CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT 9 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response, calculate_ins_level_acc 10 | 11 | 12 | if __name__ == '__main__': 13 | 14 | parser = ArgumentParser() 15 | parser.add_argument('--output_path', type=str, default="./example_outputs/qwen_vl/total_val_output.json", help="The path to model output file.") 16 | parser.add_argument('--answer_path', type=str, default="./answer_dict_val.json", help="Answer file path.") 17 | args = parser.parse_args() 18 | 19 | output_dict = json.load(open(args.output_path)) 20 | answer_dict = json.load(open(args.answer_path)) 21 | 22 | # group by category 23 | output_dict_w_cat = {} 24 | for data_id, parsed_pred in output_dict.items(): 25 | category = "_".join(data_id.split("_")[1:-1]) 26 | if category not in output_dict_w_cat: 27 | output_dict_w_cat.update({category: {}}) 28 | 
output_dict_w_cat[category].update({data_id: parsed_pred}) 29 | 30 | # group by category 31 | answer_dict_w_cat = {} 32 | for data_id, parsed_pred in answer_dict.items(): 33 | category = "_".join(data_id.split("_")[1:-1]) 34 | if category not in answer_dict_w_cat: 35 | answer_dict_w_cat.update({category: {}}) 36 | answer_dict_w_cat[category].update({data_id: parsed_pred}) 37 | 38 | evaluation_result = {} 39 | 40 | for category in CAT_SHORT2LONG.values(): 41 | print("Evaluating: {}".format(category)) 42 | # get cat_outputs and cat_answers 43 | try: 44 | cat_outputs = output_dict_w_cat[category] 45 | cat_answers = answer_dict_w_cat[category] 46 | except KeyError: 47 | print("Skipping {} for not found".format(category)) 48 | continue 49 | 50 | exampels_to_eval = [] 51 | for data_id, parsed_pred in cat_outputs.items(): 52 | question_type = cat_answers[data_id]['question_type'] 53 | if question_type != 'multiple-choice': 54 | parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.) 55 | else: 56 | parsed_pred = parsed_pred 57 | 58 | exampels_to_eval.append({ 59 | "id": data_id, 60 | "question_type": question_type, 61 | "answer": cat_answers[data_id]['ground_truth'], 62 | "parsed_pred": parsed_pred 63 | }) 64 | 65 | judge_dict, metric_dict = evaluate(exampels_to_eval) 66 | metric_dict.update({"num_example": len(exampels_to_eval)}) 67 | 68 | evaluation_result[category] = metric_dict 69 | 70 | printable_results = {} 71 | # pdb.set_trace() 72 | # add domain Subject 73 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): 74 | in_domain_cat_results = {} 75 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT 76 | if cat_name in evaluation_result.keys(): 77 | in_domain_cat_results[cat_name] = evaluation_result[cat_name] 78 | else: 79 | pass 80 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) 81 | in_domain_data_num = sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()]) 82 | printable_results['Overall-' + domain] = {"num": int(in_domain_data_num), 83 | "acc": round(in_domain_ins_acc, 3) 84 | } 85 | # add sub category 86 | for cat_name, cat_results in in_domain_cat_results.items(): 87 | printable_results[cat_name] = {"num": int(cat_results['num_example']), 88 | "acc": round(cat_results['acc'], 3) 89 | } 90 | 91 | # table.append(["-----------------------------", "-----", "----"]) 92 | all_ins_acc = calculate_ins_level_acc(evaluation_result) 93 | printable_results['Overall'] = {"num": sum([cat_results['num_example'] for cat_results in evaluation_result.values()]), 94 | "acc": round(all_ins_acc, 3) 95 | } 96 | 97 | print(printable_results) 98 | 99 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/main_parse_and_eval.py: -------------------------------------------------------------------------------- 1 | """Parse and Evalate""" 2 | import os 3 | import json 4 | from argparse import ArgumentParser 5 | 6 | from utils.data_utils import save_json, CAT_SHORT2LONG 7 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response 8 | 9 | 10 | if __name__ == '__main__': 11 | 12 | parser = ArgumentParser() 13 | parser.add_argument('--path', type=str, default="./example_outputs/llava1.5_13b", help="The path to model output directory.") 14 | parser.add_argument('--subject', nargs='+', 15 | help=f'The name of the mmmu sub-category. 
Availble: {CAT_SHORT2LONG.keys()} or ALL') 16 | 17 | args = parser.parse_args() 18 | if args.subject[0] == 'ALL': 19 | args.subject = CAT_SHORT2LONG.keys() 20 | 21 | ex_output_path = os.path.join(args.path) 22 | 23 | all_results = {} 24 | for cat_short in args.subject: 25 | category = CAT_SHORT2LONG[cat_short] 26 | print("Evaluating: {}".format(category)) 27 | if category not in os.listdir(ex_output_path): 28 | print("Skipping {} for not found".format(category)) 29 | else: 30 | cat_folder_path = os.path.join(ex_output_path, category) 31 | cat_outputs = json.load(open(os.path.join(cat_folder_path, 'output.json'))) 32 | # Evaluation 33 | eval_samples = [] 34 | for cat_output in cat_outputs: 35 | response = cat_output['response'] 36 | if cat_output['question_type'] == 'multiple-choice': 37 | all_choices = cat_output['all_choices'] 38 | index2ans = cat_output['index2ans'] 39 | parsed_pred = parse_multi_choice_response(response, all_choices, index2ans) 40 | eval_samples.append( 41 | { 42 | 'id': cat_output['id'], 43 | 'question_type': cat_output['question_type'], 44 | 'answer': cat_output['answer'], # the content in option, not answer index. 45 | 'response': response, 46 | 'parsed_pred': parsed_pred, 47 | 'index2ans': index2ans, 48 | } 49 | ) 50 | else: # open 51 | parsed_pred = parse_open_response(response) 52 | eval_samples.append( 53 | { 54 | 'id': cat_output['id'], 55 | 'question_type': cat_output['question_type'], 56 | 'answer': cat_output['answer'], 57 | 'response': response, 58 | 'parsed_pred': parsed_pred, 59 | } 60 | ) 61 | 62 | print("Num of valid samples: {}, Expected Num: {}".format(len(eval_samples), len(cat_outputs))) 63 | 64 | judge_dict, metric_dict = evaluate(eval_samples) 65 | metric_dict.update({"num_example": len(eval_samples)}) 66 | for eval_sample in eval_samples: 67 | eval_sample.update({"judge": judge_dict[eval_sample['id']]}) 68 | 69 | save_json(os.path.join(cat_folder_path, 'parsed_output.json'), eval_samples) 70 | save_json(os.path.join(cat_folder_path, 'result.json'), metric_dict) 71 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/print_results.py: -------------------------------------------------------------------------------- 1 | # Beautiful table to print results of all categories 2 | 3 | import os 4 | from typing import Dict 5 | import json 6 | import numpy as np 7 | from tabulate import tabulate 8 | 9 | from argparse import ArgumentParser 10 | 11 | from utils.data_utils import CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT 12 | 13 | from utils.eval_utils import calculate_ins_level_acc 14 | 15 | def main(): 16 | parser = ArgumentParser() 17 | parser.add_argument('--path', type=str, default="./example_outputs/blip2_flant5xxl", help="The path to output directory.") 18 | args = parser.parse_args() 19 | 20 | # load all results 21 | all_results = {} 22 | for cat_folder_name in os.listdir(args.path): 23 | if cat_folder_name in CAT_SHORT2LONG.values(): 24 | cat_folder_path = os.path.join(args.path, cat_folder_name) 25 | result_path = os.path.join(cat_folder_path, 'result.json') 26 | if os.path.exists(result_path): 27 | cat_results = json.load(open(result_path)) 28 | all_results[cat_folder_name] = cat_results 29 | 30 | # print results 31 | headers = ['Subject', 'Data Num', 'Acc'] 32 | table = [] 33 | 34 | # add domain Subject 35 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): 36 | in_domain_cat_results = {} 37 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT 38 | if cat_name in 
all_results.keys(): 39 | in_domain_cat_results[cat_name] = all_results[cat_name] 40 | else: 41 | pass 42 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) 43 | in_domain_data_num = np.sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()]) 44 | table.append(['Overall-' + domain, int(in_domain_data_num), round(in_domain_ins_acc, 3)]) 45 | # add sub category 46 | for cat_name, cat_results in in_domain_cat_results.items(): 47 | table.append([cat_name, int(cat_results['num_example']), round(cat_results['acc'], 3)]) 48 | # table.append(["-----------------------------", "-----", "----"]) 49 | 50 | # table.append(["-----------------------------", "-----", "----"]) 51 | all_ins_acc = calculate_ins_level_acc(all_results) 52 | table.append(['Overall', np.sum([cat_results['num_example'] for cat_results in all_results.values()]), round(all_ins_acc, 3)]) 53 | 54 | print(tabulate(table, headers=headers, tablefmt='orgtbl')) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | import torch 3 | 4 | def call_llava_engine_df(args, sample, model, tokenizer=None, processor=None): 5 | from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 6 | from mgm.conversation import conv_templates, SeparatorStyle 7 | 8 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 9 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 10 | 11 | def insert_separator(X, sep): 12 | return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] 13 | 14 | input_ids = [] 15 | offset = 0 16 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 17 | offset = 1 18 | input_ids.append(prompt_chunks[0][0]) 19 | 20 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 21 | input_ids.extend(x[offset:]) 22 | 23 | if return_tensors is not None: 24 | if return_tensors == 'pt': 25 | return torch.tensor(input_ids, dtype=torch.long) 26 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 27 | return input_ids 28 | 29 | def deal_with_prompt(input_text, mm_use_im_start_end, ocr_tokens): 30 | if ocr_tokens is not None: 31 | qs = input_text + '\n' + ocr_tokens 32 | else: 33 | qs = input_text 34 | if mm_use_im_start_end: 35 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 36 | else: 37 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 38 | return qs 39 | 40 | prompt = sample['final_input_prompt'] 41 | ocr_tokens = sample.get('ocr', None) 42 | prompt = deal_with_prompt(prompt, model.config.mm_use_im_start_end, ocr_tokens) 43 | conv = conv_templates[args.conv_mode].copy() 44 | conv.append_message(conv.roles[0], prompt) 45 | conv.append_message(conv.roles[1], None) 46 | prompt = conv.get_prompt() 47 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 48 | image = sample['image'] 49 | image_aux = sample['image_aux'] 50 | if image_aux is not None: 51 | image_aux = image_aux.unsqueeze(0).half().cuda() 52 | 53 | terminators = tokenizer.eos_token_id 54 | if "llama_3" in args.conv_mode: 55 | terminators = [terminators, 
tokenizer.convert_tokens_to_ids("<|eot_id|>")] 56 | 57 | if image is not None: 58 | output_ids = model.generate( 59 | input_ids, 60 | images=image.unsqueeze(0).half().cuda(), 61 | images_aux=image_aux, 62 | do_sample=True, 63 | temperature=1, 64 | num_beams=5, 65 | top_p=None, 66 | max_new_tokens=128, 67 | bos_token_id=tokenizer.bos_token_id, # Begin of sequence token 68 | eos_token_id=terminators, # End of sequence token 69 | pad_token_id=tokenizer.pad_token_id, # Pad token 70 | use_cache=True) 71 | 72 | response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip('\n') 73 | else: # multiple images actually 74 | if sample['question_type'] == 'multiple-choice': 75 | all_choices = sample['all_choices'] 76 | response = random.choice(all_choices) 77 | else: 78 | response = 'INVALID GENERATION FOR MULTIPLE IMAGE INPUTS' 79 | 80 | return response 81 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/eval/MMMU/image.png -------------------------------------------------------------------------------- /mgm/eval/MathVista/extract_answer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import time 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | 8 | import sys 9 | sys.path.append('../') 10 | from utilities import * 11 | 12 | # OpenAI 13 | import openai 14 | 15 | # load demo prompt 16 | from prompts.ext_ans import demo_prompt 17 | 18 | 19 | def verify_extraction(extraction): 20 | extraction = extraction.strip() 21 | if extraction == "" or extraction == None: 22 | return False 23 | return True 24 | 25 | 26 | def create_test_prompt(demo_prompt, query, response): 27 | demo_prompt = demo_prompt.strip() 28 | test_prompt = f"{query}\n\n{response}" 29 | full_prompt = f"{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: " 30 | return full_prompt 31 | 32 | 33 | def extract_answer(response, problem, quick_extract=False): 34 | question_type = problem['question_type'] 35 | answer_type = problem['answer_type'] 36 | choices = problem['choices'] 37 | query = problem['query'] 38 | pid = problem['pid'] 39 | 40 | if response == "": 41 | return "" 42 | 43 | if question_type == 'multi_choice' and response in choices: 44 | return response 45 | 46 | if answer_type == "integer": 47 | try: 48 | extraction = int(response) 49 | return str(extraction) 50 | except: 51 | pass 52 | 53 | if answer_type == "float": 54 | try: 55 | extraction = str(float(response)) 56 | return extraction 57 | except: 58 | pass 59 | 60 | # quick extraction 61 | if quick_extract: 62 | print("Quickly extracting answer...") 63 | # The answer is "text". 
-> "text" 64 | try: 65 | result = re.search(r'The answer is "(.*)"\.', response) 66 | if result: 67 | extraction = result.group(1) 68 | return extraction 69 | except: 70 | pass 71 | 72 | # general extraction 73 | try: 74 | full_prompt = create_test_prompt(demo_prompt, query, response) 75 | extraction = get_chat_response(full_prompt, openai.api_key, openai.api_base, model=args.llm_engine) 76 | return extraction 77 | except Exception as e: 78 | print(e) 79 | print(f"Error in extracting answer for {pid}") 80 | 81 | return "" 82 | 83 | 84 | if __name__ == '__main__': 85 | parser = argparse.ArgumentParser() 86 | # input 87 | parser.add_argument('--output_file', type=str, default='answer.json') 88 | parser.add_argument('--response_label', type=str, default='response', help='response label for the input file') 89 | # model 90 | parser.add_argument('--llm_engine', type=str, default='gpt-4-0613', help='llm engine', 91 | choices = ['gpt-3.5-turbo', 'gpt-3.5', 'gpt-4', 'gpt-4-0314', 'gpt-4-0613']) 92 | parser.add_argument('--number', type=int, default=-1, help='number of problems to run') 93 | parser.add_argument('--quick_extract', action='store_true', help='use rules to extract answer for some problems') 94 | parser.add_argument('--rerun', action='store_true', help='rerun the answer extraction') 95 | # openai 96 | parser.add_argument("--api_key", required=True, type=str, help="OpenAI API key") 97 | parser.add_argument("--api_base", default=None, type=str, help="OpenAI API base") 98 | # output 99 | parser.add_argument('--save_every', type=int, default=10, help='save every n problems') 100 | parser.add_argument('--output_label', type=str, default='', help='label for the output file') 101 | args = parser.parse_args() 102 | 103 | # args 104 | label = args.response_label 105 | result_file = args.output_file 106 | if args.output_label != '': 107 | output_file = result_file.replace('.json', f'_{args.output_label}.json') 108 | else: 109 | output_file = result_file 110 | 111 | # read results 112 | print(f"Reading {result_file}...") 113 | try: 114 | results = read_json(output_file) 115 | except: 116 | samples = [json.loads(line) for line in open(result_file)] 117 | results = {} 118 | for sample in samples: 119 | results[sample['pid']] = sample 120 | 121 | # full pids 122 | full_pids = list(results.keys()) 123 | if args.number > 0: 124 | full_pids = full_pids[:min(args.number, len(full_pids))] 125 | print("Number of testing problems:", len(full_pids)) 126 | 127 | # test pids 128 | if args.rerun: 129 | test_pids = full_pids 130 | else: 131 | test_pids = [] 132 | for pid in full_pids: 133 | # print(pid) 134 | if 'extraction' not in results[pid] or not verify_extraction(results[pid]['extraction']): 135 | test_pids.append(pid) 136 | 137 | test_num = len(test_pids) 138 | print("Number of problems to run:", test_num) 139 | # print(test_pids) 140 | 141 | # openai api 142 | openai.api_key = args.api_key # Your API key here 143 | if args.api_base: 144 | openai.api_base = args.api_base # Your API base here 145 | 146 | # tqdm, enumerate results 147 | for i, pid in enumerate(tqdm(test_pids)): 148 | problem = results[pid] 149 | 150 | assert label in problem 151 | response = problem[label] 152 | 153 | 154 | extraction = extract_answer(response, problem, args.quick_extract) 155 | results[pid]['extraction'] = extraction 156 | 157 | if i % args.save_every == 0 or i == test_num - 1: 158 | print(f"Saving results to {output_file}...") 159 | save_json(results, output_file) 160 | print(f"Results saved.") 161 | 
-------------------------------------------------------------------------------- /mgm/eval/MathVista/prompts/ext_ans.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # pids = 852, 104, 824, 506, 540 4 | 5 | demo_prompt = """ 6 | Please read the following example. Then extract the answer from the model response and type it at the end of the prompt. 7 | 8 | Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end. 9 | Question: Which number is missing? 10 | 11 | Model response: The number missing in the sequence is 14. 12 | 13 | Extracted answer: 14 14 | 15 | Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end. 16 | Question: What is the fraction of females facing the camera? 17 | 18 | Model response: The fraction of females facing the camera is 0.6, which means that six out of ten females in the group are facing the camera. 19 | 20 | Extracted answer: 0.6 21 | 22 | Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end. 23 | Question: How much money does Luca need to buy a sour apple candy and a butterscotch candy? (Unit: $) 24 | 25 | Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy. 26 | 27 | Extracted answer: 1.45 28 | 29 | Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end. 30 | Question: Between which two years does the line graph saw its maximum peak? 31 | 32 | Model response: The line graph saw its maximum peak between 2007 and 2008. 33 | 34 | Extracted answer: [2007, 2008] 35 | 36 | Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end. 37 | Question: What fraction of the shape is blue?\nChoices:\n(A) 3/11\n(B) 8/11\n(C) 6/11\n(D) 3/5 38 | 39 | Model response: The correct answer is (B) 8/11. 40 | 41 | Extracted answer: B 42 | """ -------------------------------------------------------------------------------- /mgm/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /mgm/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | 86 | if isinstance(inst['caption'], list): 87 | cap_str = '\n'.join(inst['caption']) 88 | else: 89 | cap_str = inst['caption'] 90 | 91 | category = 'llava_bench_' + json.loads(ques_js)['category'] 92 | if category in rule_dict: 93 | rule = rule_dict[category] 94 | else: 95 | assert False, f"Visual QA category not found in rule file: {category}." 
96 | prompt = rule['prompt'] 97 | role = rule['role'] 98 | content = (f'[Context]\n{cap_str}\n\n' 99 | f'[Question]\n{ques["text"]}\n\n' 100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 102 | f'[System]\n{prompt}\n\n') 103 | cur_js = { 104 | 'id': idx+1, 105 | 'question_id': ques['question_id'], 106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 108 | 'category': category 109 | } 110 | if idx >= len(cur_reviews): 111 | review = get_eval(content, args.max_tokens) 112 | scores = parse_score(review) 113 | cur_js['content'] = review 114 | cur_js['tuple'] = scores 115 | review_file.write(json.dumps(cur_js) + '\n') 116 | review_file.flush() 117 | else: 118 | print(f'Skipping {idx} as we already have it.') 119 | idx += 1 120 | print(idx) 121 | review_file.close() 122 | -------------------------------------------------------------------------------- /mgm/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js 
in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /mgm/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | 
parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /mgm/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /mgm/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /mgm/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from mgm.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /mgm/eval/generate_webpage_data_from_table.py: -------------------------------------------------------------------------------- 1 | """Generate json file for webpage.""" 2 | import json 3 | import os 4 | import re 5 | 6 | # models = ['llama', 'alpaca', 'gpt35', 'bard'] 7 | models = ['vicuna'] 8 | 9 | 10 | def read_jsonl(path: str, key: str=None): 11 | data = [] 12 | with open(os.path.expanduser(path)) as f: 13 | for line in f: 14 | if not line: 15 | continue 16 | data.append(json.loads(line)) 17 | if key is not None: 18 | data.sort(key=lambda x: x[key]) 19 | data = {item[key]: item for item in data} 20 | return data 21 | 22 | 23 | def trim_hanging_lines(s: str, n: int) -> str: 24 | s = s.strip() 25 | for _ in range(n): 26 | s = s.split('\n', 1)[1].strip() 27 | return s 28 | 29 | 30 | if __name__ == '__main__': 31 | questions = read_jsonl('table/question.jsonl', key='question_id') 32 | 33 | # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') 34 | # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') 35 | # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') 36 | # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') 37 | vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') 38 | ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') 39 | 40 | review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') 41 | # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') 42 | # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', 
key='question_id') 43 | # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') 44 | # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') 45 | 46 | records = [] 47 | for qid in questions.keys(): 48 | r = { 49 | 'id': qid, 50 | 'category': questions[qid]['category'], 51 | 'question': questions[qid]['text'], 52 | 'answers': { 53 | # 'alpaca': alpaca_answers[qid]['text'], 54 | # 'llama': llama_answers[qid]['text'], 55 | # 'bard': bard_answers[qid]['text'], 56 | # 'gpt35': gpt35_answers[qid]['text'], 57 | 'vicuna': vicuna_answers[qid]['text'], 58 | 'ours': ours_answers[qid]['text'], 59 | }, 60 | 'evaluations': { 61 | # 'alpaca': review_alpaca[qid]['text'], 62 | # 'llama': review_llama[qid]['text'], 63 | # 'bard': review_bard[qid]['text'], 64 | 'vicuna': review_vicuna[qid]['content'], 65 | # 'gpt35': review_gpt35[qid]['text'], 66 | }, 67 | 'scores': { 68 | 'vicuna': review_vicuna[qid]['tuple'], 69 | # 'alpaca': review_alpaca[qid]['score'], 70 | # 'llama': review_llama[qid]['score'], 71 | # 'bard': review_bard[qid]['score'], 72 | # 'gpt35': review_gpt35[qid]['score'], 73 | }, 74 | } 75 | 76 | # cleanup data 77 | cleaned_evals = {} 78 | for k, v in r['evaluations'].items(): 79 | v = v.strip() 80 | lines = v.split('\n') 81 | # trim the first line if it's a pair of numbers 82 | if re.match(r'\d+[, ]+\d+', lines[0]): 83 | lines = lines[1:] 84 | v = '\n'.join(lines) 85 | cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') 86 | 87 | r['evaluations'] = cleaned_evals 88 | records.append(r) 89 | 90 | # Reorder the records, this is optional 91 | for r in records: 92 | if r['id'] <= 20: 93 | r['id'] += 60 94 | else: 95 | r['id'] -= 20 96 | for r in records: 97 | if r['id'] <= 50: 98 | r['id'] += 10 99 | elif 50 < r['id'] <= 60: 100 | r['id'] -= 50 101 | for r in records: 102 | if r['id'] == 7: 103 | r['id'] = 1 104 | elif r['id'] < 7: 105 | r['id'] += 1 106 | 107 | records.sort(key=lambda x: x['id']) 108 | 109 | # Write to file 110 | with open('webpage/data.json', 'w') as f: 111 | json.dump({'questions': records, 'models': models}, f, indent=2) 112 | -------------------------------------------------------------------------------- /mgm/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from mgm.conversation import default_conversation 10 | from mgm.utils import disable_torch_init 11 | 12 | 13 | @torch.inference_mode() 14 | def eval_model(model_name, questions_file, answers_file): 15 | # Model 16 | disable_torch_init() 17 | model_name = os.path.expanduser(model_name) 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 19 | model = AutoModelForCausalLM.from_pretrained(model_name, 20 | torch_dtype=torch.float16).cuda() 21 | 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 | input_ids = 
torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 41 | try: 42 | index = outputs.index(conv.sep, len(prompt)) 43 | except ValueError: 44 | outputs += conv.sep 45 | index = outputs.index(conv.sep, len(prompt)) 46 | 47 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 48 | ans_id = shortuuid.uuid() 49 | ans_file.write(json.dumps({"question_id": idx, 50 | "text": outputs, 51 | "answer_id": ans_id, 52 | "model_id": model_name, 53 | "metadata": {}}) + "\n") 54 | ans_file.flush() 55 | ans_file.close() 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 60 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 61 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 62 | args = parser.parse_args() 63 | 64 | eval_model(args.model_name, args.question_file, args.answers_file) -------------------------------------------------------------------------------- /mgm/eval/model_vqa_qbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | from tqdm import tqdm 4 | import json 5 | 6 | from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 7 | from mgm.conversation import conv_templates, SeparatorStyle 8 | from mgm.model.builder import load_pretrained_model 9 | from mgm.utils import disable_torch_init 10 | from mgm.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 11 | 12 | from PIL import Image 13 | 14 | import requests 15 | from PIL import Image 16 | from io import BytesIO 17 | 18 | 19 | def load_image(image_file): 20 | if image_file.startswith('http') or image_file.startswith('https'): 21 | response = requests.get(image_file) 22 | image = Image.open(BytesIO(response.content)).convert('RGB') 23 | else: 24 | image = Image.open(image_file).convert('RGB') 25 | return image 26 | 27 | 28 | def eval_model(args): 29 | # Model 30 | disable_torch_init() 31 | 32 | model_name = get_model_name_from_path(args.model_path) 33 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, True) 34 | 35 | 36 | 37 | 38 | with open(args.questions_file) as f: 39 | llvqa_data = json.load(f) 40 | 41 | for i, llddata in enumerate(tqdm(llvqa_data)): 42 | filename = llddata["img_path"] 43 | if args.lang == "en": 44 | message = llddata["question"] + "\nChoose between one of the options as follows:\n" 45 | elif args.lang == "zh": 46 | message = llddata["question"] + "\在下列选项中选择一个:\n" 47 | else: 48 | raise NotImplementedError("Q-Bench does not support languages other than English (en) and Chinese (zh) yet. 
Contact us (https://github.com/VQAssessment/Q-Bench/) to convert Q-Bench into more languages.") 49 | for choice, ans in zip(["A.", "B.", "C.", "D."], llddata["candidates"]): 50 | message += f"{choice} {ans}\n" 51 | qs = message 52 | 53 | if model.config.mm_use_im_start_end: 54 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 55 | else: 56 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 57 | 58 | if 'llama-2' in model_name.lower(): 59 | conv_mode = "llava_llama_2" 60 | elif "v1" in model_name.lower(): 61 | conv_mode = "llava_v1" 62 | elif "mpt" in model_name.lower(): 63 | conv_mode = "mpt" 64 | else: 65 | conv_mode = "llava_v0" 66 | 67 | if args.conv_mode is not None and conv_mode != args.conv_mode: 68 | print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode)) 69 | else: 70 | args.conv_mode = conv_mode 71 | 72 | conv = conv_templates[args.conv_mode].copy() 73 | conv.append_message(conv.roles[0], qs) 74 | conv.append_message(conv.roles[1], None) 75 | prompt = conv.get_prompt() 76 | 77 | image = load_image(args.image_folder + filename) 78 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda() 79 | 80 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 81 | 82 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 83 | keywords = [stop_str] 84 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 85 | 86 | 87 | with torch.inference_mode(): 88 | output_ids = model.generate( 89 | input_ids, 90 | images=image_tensor, 91 | num_beams=1, 92 | do_sample=False, 93 | temperature=0, 94 | max_new_tokens=1024, 95 | use_cache=True, 96 | stopping_criteria=[stopping_criteria]) 97 | 98 | input_token_len = input_ids.shape[1] 99 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 100 | if n_diff_input_output > 0: 101 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 102 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 103 | outputs = outputs.strip() 104 | if outputs.endswith(stop_str): 105 | outputs = outputs[:-len(stop_str)] 106 | outputs = outputs.strip() 107 | llddata["response"] = outputs 108 | with open(args.answers_file, "a") as wf: 109 | json.dump(llddata, wf) 110 | 111 | if __name__ == "__main__": 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument("--model-path", type=str, default="llava-v1.5") 114 | parser.add_argument("--model-base", type=str, default=None) 115 | parser.add_argument("--image-folder", type=str, default="./playground/data/qbench/images_llvisionqa") 116 | parser.add_argument("--questions-file", type=str, default="./playground/data/qbench/llvisionqa_dev.json") 117 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 118 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 119 | parser.add_argument("--lang", type=str, default="en") 120 | args = parser.parse_args() 121 | 122 | eval_model(args) 123 | -------------------------------------------------------------------------------- /mgm/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | 
import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /mgm/eval/run_llava.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from mgm.constants import ( 5 | IMAGE_TOKEN_INDEX, 6 | DEFAULT_IMAGE_TOKEN, 7 | DEFAULT_IM_START_TOKEN, 8 | DEFAULT_IM_END_TOKEN, 9 | IMAGE_PLACEHOLDER, 10 | ) 11 | from mgm.conversation import conv_templates, SeparatorStyle 12 | from mgm.model.builder import load_pretrained_model 13 | from mgm.utils import disable_torch_init 14 | from mgm.mm_utils import ( 15 | process_images, 16 | tokenizer_image_token, 17 | get_model_name_from_path, 18 | ) 19 | 20 | from PIL import Image 21 | 22 | import requests 23 | from PIL import Image 24 | from io import BytesIO 25 | import re 26 | 27 | 28 | def image_parser(args): 29 | out = args.image_file.split(args.sep) 30 | return out 31 | 32 | 33 | def load_image(image_file): 34 | if image_file.startswith("http") or image_file.startswith("https"): 35 | response = requests.get(image_file) 36 | image = Image.open(BytesIO(response.content)).convert("RGB") 37 | else: 38 | image = Image.open(image_file).convert("RGB") 39 | return image 40 | 41 | 42 | def load_images(image_files): 43 | out = [] 44 | for image_file in image_files: 45 | image = load_image(image_file) 46 | out.append(image) 47 | return out 48 | 49 | 50 | def eval_model(args): 51 | # Model 52 | disable_torch_init() 53 | 54 | model_name = 
get_model_name_from_path(args.model_path) 55 | tokenizer, model, image_processor, context_len = load_pretrained_model( 56 | args.model_path, args.model_base, model_name 57 | ) 58 | 59 | qs = args.query 60 | image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN 61 | if IMAGE_PLACEHOLDER in qs: 62 | if model.config.mm_use_im_start_end: 63 | qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs) 64 | else: 65 | qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs) 66 | else: 67 | if model.config.mm_use_im_start_end: 68 | qs = image_token_se + "\n" + qs 69 | else: 70 | qs = DEFAULT_IMAGE_TOKEN + "\n" + qs 71 | 72 | if "llama-2" in model_name.lower(): 73 | conv_mode = "llava_llama_2" 74 | elif "mistral" in model_name.lower(): 75 | conv_mode = "mistral_instruct" 76 | elif "v1.6-34b" in model_name.lower(): 77 | conv_mode = "chatml_direct" 78 | elif "v1" in model_name.lower(): 79 | conv_mode = "llava_v1" 80 | elif "mpt" in model_name.lower(): 81 | conv_mode = "mpt" 82 | else: 83 | conv_mode = "llava_v0" 84 | 85 | if args.conv_mode is not None and conv_mode != args.conv_mode: 86 | print( 87 | "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format( 88 | conv_mode, args.conv_mode, args.conv_mode 89 | ) 90 | ) 91 | else: 92 | args.conv_mode = conv_mode 93 | 94 | conv = conv_templates[args.conv_mode].copy() 95 | conv.append_message(conv.roles[0], qs) 96 | conv.append_message(conv.roles[1], None) 97 | prompt = conv.get_prompt() 98 | 99 | image_files = image_parser(args) 100 | images = load_images(image_files) 101 | images_tensor = process_images( 102 | images, 103 | image_processor, 104 | model.config 105 | ).to(model.device, dtype=torch.float16) 106 | 107 | input_ids = ( 108 | tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") 109 | .unsqueeze(0) 110 | .cuda() 111 | ) 112 | 113 | with torch.inference_mode(): 114 | output_ids = model.generate( 115 | input_ids, 116 | images=images_tensor, 117 | do_sample=True if args.temperature > 0 else False, 118 | temperature=args.temperature, 119 | top_p=args.top_p, 120 | num_beams=args.num_beams, 121 | max_new_tokens=args.max_new_tokens, 122 | use_cache=True, 123 | ) 124 | 125 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 126 | print(outputs) 127 | 128 | 129 | if __name__ == "__main__": 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 132 | parser.add_argument("--model-base", type=str, default=None) 133 | parser.add_argument("--image-file", type=str, required=True) 134 | parser.add_argument("--query", type=str, required=True) 135 | parser.add_argument("--conv-mode", type=str, default=None) 136 | parser.add_argument("--sep", type=str, default=",") 137 | parser.add_argument("--temperature", type=float, default=0.2) 138 | parser.add_argument("--top_p", type=float, default=None) 139 | parser.add_argument("--num_beams", type=int, default=1) 140 | parser.add_argument("--max_new_tokens", type=int, default=512) 141 | args = parser.parse_args() 142 | 143 | eval_model(args) -------------------------------------------------------------------------------- /mgm/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = 
argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /mgm/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /mgm/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /mgm/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mgm/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /mgm/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /mgm/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /mgm/eval/webpage/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 3 | background-color: #f8f9fa; 4 | } 5 | 6 | .navbar-dark .navbar-nav .nav-link { 7 | color: #f1cf68; 8 | font-size: 1.1rem; 9 | padding: 0.5rem 0.6rem; 10 | } 11 | 12 | .card-header { 13 | font-weight: bold; 14 | } 15 | 16 | .card { 17 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 18 | transition: 0.3s; 19 | } 20 | 21 | .card:hover { 22 | box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); 23 | } 24 | 25 | button { 26 | transition: background-color 0.3s; 27 | } 28 | 29 | button:hover { 30 | background-color: #007bff; 31 | } 32 | 33 | @media (max-width: 767px) { 34 | .form-row .form-group { 35 | margin-bottom: 10px; 36 | } 37 | } 38 | 39 | /* Extra styles */ 40 | 41 | .expandable-card .card-text-container { 42 | max-height: 200px; 43 | overflow-y: hidden; 44 | position: relative; 45 | } 46 | 47 | .expandable-card.expanded .card-text-container { 48 | max-height: none; 49 | } 50 | 51 | .expand-btn { 52 | position: relative; 53 | display: none; 54 | background-color: rgba(255, 255, 255, 0.8); 55 | color: #510c75; 56 | border-color: transparent; 57 | } 58 | 59 | .expand-btn:hover { 60 | background-color: rgba(200, 200, 200, 0.8); 61 | text-decoration: none; 62 | border-color: transparent; 63 | color: #510c75; 64 | } 65 | 66 | .expand-btn:focus { 67 | outline: none; 68 | text-decoration: none; 69 | } 70 | 71 | .expandable-card:not(.expanded) .card-text-container:after { 72 | content: ""; 73 | position: absolute; 74 | bottom: 0; 75 | left: 0; 76 | width: 100%; 77 | height: 90px; 78 | background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); 79 | } 80 | 81 | .expandable-card:not(.expanded) .expand-btn { 82 | margin-top: -40px; 83 | } 84 | 85 | .card-body { 86 | padding-bottom: 5px; 87 | } 88 | 89 | .vertical-flex-layout { 90 | justify-content: center; 91 | align-items: center; 92 | height: 100%; 93 | display: flex; 94 | flex-direction: column; 95 | gap: 5px; 96 | } 97 | 98 | .figure-img { 99 | max-width: 100%; 100 | height: auto; 101 | } 102 | 103 | .adjustable-font-size { 104 | font-size: calc(0.5rem + 2vw); 105 | } 106 | -------------------------------------------------------------------------------- /mgm/mm_utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from io import BytesIO 3 | import base64 4 | 5 | import torch 6 | from transformers import StoppingCriteria 7 | from mgm.constants import IMAGE_TOKEN_INDEX 8 | 9 | 10 | def load_image_from_base64(image): 11 | return Image.open(BytesIO(base64.b64decode(image))) 12 | 13 | 14 | def expand2square(pil_img, background_color): 15 | width, height = pil_img.size 16 | if width == height: 17 | return pil_img 18 | elif width > height: 19 | result = Image.new(pil_img.mode, (width, width), background_color) 20 | result.paste(pil_img, (0, (width - height) // 2)) 21 | return result 22 | else: 23 | result = Image.new(pil_img.mode, (height, height), background_color) 24 | 
result.paste(pil_img, ((height - width) // 2, 0)) 25 | return result 26 | 27 | 28 | def process_images(images, image_processor, model_cfg): 29 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 30 | new_images = [] 31 | if image_aspect_ratio == 'pad': 32 | for image in images: 33 | image = expand2square(image.convert('RGB'), tuple(int(x*255) for x in image_processor.image_mean)) 34 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 35 | new_images.append(image) 36 | else: 37 | return image_processor(images, return_tensors='pt')['pixel_values'] 38 | if all(x.shape == new_images[0].shape for x in new_images): 39 | new_images = torch.stack(new_images, dim=0) 40 | return new_images 41 | 42 | 43 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 44 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')] 45 | 46 | def insert_separator(X, sep): 47 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 48 | 49 | input_ids = [] 50 | offset = 0 51 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 52 | offset = 1 53 | input_ids.append(prompt_chunks[0][0]) 54 | 55 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 56 | input_ids.extend(x[offset:]) 57 | 58 | if return_tensors is not None: 59 | if return_tensors == 'pt': 60 | return torch.tensor(input_ids, dtype=torch.long) 61 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 62 | return input_ids 63 | 64 | 65 | def get_model_name_from_path(model_path): 66 | model_path = model_path.strip("/") 67 | model_paths = model_path.split("/") 68 | if model_paths[-1].startswith('checkpoint-'): 69 | return model_paths[-2] + "_" + model_paths[-1] 70 | else: 71 | return model_paths[-1] 72 | 73 | class KeywordsStoppingCriteria(StoppingCriteria): 74 | def __init__(self, keywords, tokenizer, input_ids): 75 | self.keywords = keywords 76 | self.keyword_ids = [] 77 | self.max_keyword_len = 0 78 | for keyword in keywords: 79 | cur_keyword_ids = tokenizer(keyword).input_ids 80 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 81 | cur_keyword_ids = cur_keyword_ids[1:] 82 | if len(cur_keyword_ids) > self.max_keyword_len: 83 | self.max_keyword_len = len(cur_keyword_ids) 84 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 85 | self.tokenizer = tokenizer 86 | self.start_len = input_ids.shape[1] 87 | 88 | def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 89 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 90 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 91 | for keyword_id in self.keyword_ids: 92 | truncated_output_ids = output_ids[0, -keyword_id.shape[0]:] 93 | if torch.equal(truncated_output_ids, keyword_id): 94 | return True 95 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 96 | for keyword in self.keywords: 97 | if keyword in outputs: 98 | return True 99 | return False 100 | 101 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 102 | outputs = [] 103 | for i in range(output_ids.shape[0]): 104 | outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores)) 105 | return all(outputs) --------------------------------------------------------------------------------
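The two helpers above, tokenizer_image_token and KeywordsStoppingCriteria, are what the evaluation scripts in this repository use to build generation inputs. The snippet below is a minimal, illustrative usage sketch only, not part of mgm/mm_utils.py; it assumes `tokenizer`, `model`, `image_tensor`, `prompt`, and `stop_str` have already been prepared (e.g. via load_pretrained_model and a conversation template, as in mgm/eval/model_vqa_qbench.py).

# Illustrative sketch (assumed setup, not repository code): wiring the helpers above into generation.
from mgm.constants import IMAGE_TOKEN_INDEX
from mgm.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria

# `prompt` contains the <image> placeholder; tokenizer_image_token splices IMAGE_TOKEN_INDEX at that position.
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
# Stop generation once the conversation separator (stop_str) appears in the decoded output.
stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
output_ids = model.generate(input_ids, images=image_tensor, do_sample=False, max_new_tokens=128, use_cache=True, stopping_criteria=[stopping_criteria])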
/mgm/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.mgm_llama import MGMLlamaForCausalLM 2 | try: 3 | from .language_model.mgm_mistral import MGMMistralForCausalLM 4 | from .language_model.mgm_mixtral import MGMMixtralForCausalLM 5 | from .language_model.mgm_gemma import MGMGemmaForCausalLM 6 | except: 7 | ImportWarning("New model not imported. Try to update Transformers.") -------------------------------------------------------------------------------- /mgm/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m mgm.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from mgm.model import * 10 | from mgm.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /mgm/model/language_model/mgm_mistral.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ------------------------------------------------------------------------ 15 | # Modified from LLaVA (https://github.com/haotian-liu/LLaVA) 16 | # Copyright 2024 Yanwei Li 17 | # ------------------------------------------------------------------------ 18 | 19 | from typing import List, Optional, Tuple, Union 20 | 21 | import torch 22 | import torch.nn as nn 23 | 24 | from transformers import AutoConfig, AutoModelForCausalLM, \ 25 | MistralConfig, MistralModel, MistralForCausalLM 26 | 27 | from transformers.modeling_outputs import CausalLMOutputWithPast 28 | from transformers.generation.utils import GenerateOutput 29 | from transformers.generation.utils import logging 30 | 31 | from ..mgm_arch import MGMMetaModel, MGMMetaForCausalLM 32 | 33 | logger = logging.get_logger(__name__) 34 | 35 | class MGMConfig(MistralConfig): 36 | model_type = "mgm_mistral" 37 | 38 | 39 | class MGMMistralModel(MGMMetaModel, MistralModel): 40 | config_class = MGMConfig 41 | 42 | def __init__(self, config: MistralConfig): 43 | super(MGMMistralModel, self).__init__(config) 44 | # self.max_pos_idx = 0 45 | 46 | class MGMMistralForCausalLM(MistralForCausalLM, MGMMetaForCausalLM): 47 | config_class = MGMConfig 48 | 49 | def __init__(self, config): 50 | super(MistralForCausalLM, self).__init__(config) 51 | self.model = MGMMistralModel(config) 52 | # self.pretraining_tp = config.pretraining_tp 53 | self.vocab_size = config.vocab_size 54 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 55 | 56 | # Initialize weights and apply final processing 57 | self.post_init() 58 | 59 | def get_model(self): 60 | return self.model 61 | 62 | def forward( 63 | self, 64 | input_ids: torch.LongTensor = None, 65 | attention_mask: Optional[torch.Tensor] = None, 66 | position_ids: Optional[torch.LongTensor] = None, 67 | past_key_values: Optional[List[torch.FloatTensor]] = None, 68 | inputs_embeds: Optional[torch.FloatTensor] = None, 69 | labels: Optional[torch.LongTensor] = None, 70 | use_cache: Optional[bool] = None, 71 | output_attentions: Optional[bool] = None, 72 | output_hidden_states: Optional[bool] = None, 73 | images: Optional[torch.FloatTensor] = None, 74 | images_aux: Optional[torch.FloatTensor] = None, 75 | return_dict: Optional[bool] = None, 76 | ) -> Union[Tuple, CausalLMOutputWithPast]: 77 | 78 | if inputs_embeds is None: 79 | ( 80 | input_ids, 81 | position_ids, 82 | attention_mask, 83 | past_key_values, 84 | inputs_embeds, 85 | labels 86 | ) = self.prepare_inputs_labels_for_multimodal( 87 | input_ids, 88 | position_ids, 89 | attention_mask, 90 | past_key_values, 91 | labels, 92 | images, 93 | images_aux 94 | ) 95 | 96 | return super().forward( 97 | input_ids=input_ids, 98 | attention_mask=attention_mask, 99 | position_ids=position_ids, 100 | past_key_values=past_key_values, 101 | inputs_embeds=inputs_embeds, 102 | labels=labels, 103 | use_cache=use_cache, 104 | output_attentions=output_attentions, 105 | output_hidden_states=output_hidden_states, 106 | return_dict=return_dict 107 | ) 108 | 109 | @torch.no_grad() 110 | def generate( 111 | self, 112 | inputs: Optional[torch.Tensor] = None, 113 | images: Optional[torch.Tensor] = None, 114 | images_aux: Optional[torch.FloatTensor] = None, 115 | **kwargs, 116 | ) -> Union[GenerateOutput, torch.LongTensor]: 117 | position_ids = kwargs.pop("position_ids", None) 118 | attention_mask = kwargs.pop("attention_mask", None) 119 | if "inputs_embeds" in kwargs: 120 | raise NotImplementedError("`inputs_embeds` is not supported") 121 | 122 | if images is not 
None: 123 | ( 124 | inputs, 125 | position_ids, 126 | attention_mask, 127 | _, 128 | inputs_embeds, 129 | _ 130 | ) = self.prepare_inputs_labels_for_multimodal( 131 | inputs, 132 | position_ids, 133 | attention_mask, 134 | None, 135 | None, 136 | images, 137 | images_aux 138 | ) 139 | else: 140 | inputs_embeds = self.get_model().embed_tokens(inputs) 141 | 142 | return super().generate( 143 | position_ids=position_ids, 144 | attention_mask=attention_mask, 145 | inputs_embeds=inputs_embeds, 146 | **kwargs 147 | ) 148 | 149 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 150 | images = kwargs.pop("images", None) 151 | images_aux = kwargs.pop("images_aux", None) 152 | _inputs = super().prepare_inputs_for_generation( 153 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 154 | ) 155 | if images is not None: 156 | _inputs['images'] = images 157 | if images_aux is not None: 158 | _inputs['images_aux'] = images_aux 159 | return _inputs 160 | 161 | AutoConfig.register("mgm_mistral", MGMConfig) 162 | AutoModelForCausalLM.register(MGMConfig, MGMMistralForCausalLM) -------------------------------------------------------------------------------- /mgm/model/language_model/mgm_mixtral.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ------------------------------------------------------------------------ 15 | # Modified from LLaVA (https://github.com/haotian-liu/LLaVA) 16 | # Copyright 2024 Yanwei Li 17 | # ------------------------------------------------------------------------ 18 | 19 | from typing import List, Optional, Tuple, Union 20 | 21 | import torch 22 | import torch.nn as nn 23 | 24 | from transformers import AutoConfig, AutoModelForCausalLM, \ 25 | MixtralConfig, MixtralModel, MixtralForCausalLM 26 | 27 | from transformers.modeling_outputs import CausalLMOutputWithPast 28 | from transformers.generation.utils import GenerateOutput 29 | from transformers.generation.utils import logging 30 | 31 | from ..mgm_arch import MGMMetaModel, MGMMetaForCausalLM 32 | 33 | logger = logging.get_logger(__name__) 34 | 35 | class MGMConfig(MixtralConfig): 36 | model_type = "mgm_mixtral" 37 | 38 | 39 | class MGMMixtralModel(MGMMetaModel, MixtralModel): 40 | config_class = MGMConfig 41 | 42 | def __init__(self, config: MixtralConfig): 43 | super(MGMMixtralModel, self).__init__(config) 44 | # self.max_pos_idx = 0 45 | 46 | class MGMMixtralForCausalLM(MixtralForCausalLM, MGMMetaForCausalLM): 47 | config_class = MGMConfig 48 | 49 | def __init__(self, config): 50 | super(MixtralForCausalLM, self).__init__(config) 51 | self.model = MGMMixtralModel(config) 52 | # self.pretraining_tp = config.pretraining_tp 53 | self.vocab_size = config.vocab_size 54 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 55 | 56 | # Initialize weights and apply final processing 57 | self.post_init() 58 | 59 | def get_model(self): 60 | return self.model 61 | 62 | def forward( 63 | self, 64 | input_ids: torch.LongTensor = None, 65 | attention_mask: Optional[torch.Tensor] = None, 66 | position_ids: Optional[torch.LongTensor] = None, 67 | past_key_values: Optional[List[torch.FloatTensor]] = None, 68 | inputs_embeds: Optional[torch.FloatTensor] = None, 69 | labels: Optional[torch.LongTensor] = None, 70 | use_cache: Optional[bool] = None, 71 | output_attentions: Optional[bool] = None, 72 | output_hidden_states: Optional[bool] = None, 73 | images: Optional[torch.FloatTensor] = None, 74 | images_aux: Optional[torch.FloatTensor] = None, 75 | return_dict: Optional[bool] = None, 76 | ) -> Union[Tuple, CausalLMOutputWithPast]: 77 | 78 | if inputs_embeds is None: 79 | ( 80 | input_ids, 81 | position_ids, 82 | attention_mask, 83 | past_key_values, 84 | inputs_embeds, 85 | labels 86 | ) = self.prepare_inputs_labels_for_multimodal( 87 | input_ids, 88 | position_ids, 89 | attention_mask, 90 | past_key_values, 91 | labels, 92 | images, 93 | images_aux 94 | ) 95 | 96 | return super().forward( 97 | input_ids=input_ids, 98 | attention_mask=attention_mask, 99 | position_ids=position_ids, 100 | past_key_values=past_key_values, 101 | inputs_embeds=inputs_embeds, 102 | labels=labels, 103 | use_cache=use_cache, 104 | output_attentions=output_attentions, 105 | output_hidden_states=output_hidden_states, 106 | return_dict=return_dict 107 | ) 108 | 109 | @torch.no_grad() 110 | def generate( 111 | self, 112 | inputs: Optional[torch.Tensor] = None, 113 | images: Optional[torch.Tensor] = None, 114 | images_aux: Optional[torch.FloatTensor] = None, 115 | **kwargs, 116 | ) -> Union[GenerateOutput, torch.LongTensor]: 117 | position_ids = kwargs.pop("position_ids", None) 118 | attention_mask = kwargs.pop("attention_mask", None) 119 | if "inputs_embeds" in kwargs: 120 | raise NotImplementedError("`inputs_embeds` is not supported") 121 | 122 | if images is not 
None: 123 | ( 124 | inputs, 125 | position_ids, 126 | attention_mask, 127 | _, 128 | inputs_embeds, 129 | _ 130 | ) = self.prepare_inputs_labels_for_multimodal( 131 | inputs, 132 | position_ids, 133 | attention_mask, 134 | None, 135 | None, 136 | images, 137 | images_aux 138 | ) 139 | else: 140 | inputs_embeds = self.get_model().embed_tokens(inputs) 141 | 142 | return super().generate( 143 | position_ids=position_ids, 144 | attention_mask=attention_mask, 145 | inputs_embeds=inputs_embeds, 146 | **kwargs 147 | ) 148 | 149 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 150 | images = kwargs.pop("images", None) 151 | images_aux = kwargs.pop("images_aux", None) 152 | _inputs = super().prepare_inputs_for_generation( 153 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 154 | ) 155 | if images is not None: 156 | _inputs['images'] = images 157 | if images_aux is not None: 158 | _inputs['images_aux'] = images_aux 159 | return _inputs 160 | 161 | AutoConfig.register("mgm_mixtral", MGMConfig) 162 | AutoModelForCausalLM.register(MGMConfig, MGMMixtralForCausalLM) -------------------------------------------------------------------------------- /mgm/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | from .eva_encoder import EVAVisionTower 4 | from .openclip_encoder import OpenCLIPVisionTower 5 | 6 | 7 | def build_vision_tower(vision_tower_cfg, **kwargs): 8 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 9 | image_processor = getattr(vision_tower_cfg, 'image_processor', getattr(vision_tower_cfg, 'image_processor', "../processor/clip-patch14-224")) 10 | 11 | if not os.path.exists(vision_tower): 12 | raise ValueError(f'Not find vision tower: {vision_tower}') 13 | 14 | if "openai" in vision_tower.lower() or "ShareGPT4V" in vision_tower: 15 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | elif "lavis" in vision_tower.lower() or "eva" in vision_tower.lower(): 17 | return EVAVisionTower(vision_tower, image_processor, args=vision_tower_cfg, **kwargs) 18 | else: 19 | raise ValueError(f'Unknown vision tower: {vision_tower}') 20 | 21 | 22 | def build_vision_tower_aux(vision_tower_cfg, **kwargs): 23 | vision_tower_aux = getattr(vision_tower_cfg, 'mm_vision_tower_aux', getattr(vision_tower_cfg, 'vision_tower_aux', None)) 24 | 25 | if not os.path.exists(vision_tower_aux): 26 | raise ValueError(f'Not find vision tower: {vision_tower_aux}') 27 | 28 | if "openclip" in vision_tower_aux.lower(): 29 | return OpenCLIPVisionTower(vision_tower_aux, args=vision_tower_cfg, **kwargs) 30 | elif "openai" in vision_tower_aux.lower(): 31 | return CLIPVisionTower(vision_tower_aux, args=vision_tower_cfg, **kwargs) 32 | else: 33 | raise ValueError(f'Unknown vision tower: {vision_tower_aux}') -------------------------------------------------------------------------------- /mgm/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | from ..processor.video_processor import VideoFramesProcessor 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | 
self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | self.is_optimize = getattr(args, 'optimize_vision_tower', False) 17 | 18 | if not delay_load: 19 | self.load_model() 20 | elif getattr(args, 'unfreeze_mm_vision_tower', False): 21 | self.load_model() 22 | else: 23 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 24 | 25 | def load_model(self): 26 | self.image_processor = VideoFramesProcessor.from_pretrained(self.vision_tower_name) 27 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 28 | self.vision_tower.requires_grad_(False) 29 | 30 | self.is_loaded = True 31 | 32 | def feature_select(self, image_forward_outs): 33 | image_features = image_forward_outs.hidden_states[self.select_layer] 34 | if self.select_feature == 'patch': 35 | image_features = image_features[:, 1:] 36 | elif self.select_feature == 'cls_patch': 37 | image_features = image_features 38 | else: 39 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 40 | return image_features 41 | 42 | def image_forward(self, images): 43 | if type(images) is list: 44 | image_features = [] 45 | for image in images: 46 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 47 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 48 | image_features.append(image_feature) 49 | else: 50 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 51 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 52 | 53 | return image_features 54 | 55 | def forward(self, images): 56 | if not self.is_optimize: 57 | with torch.no_grad(): 58 | image_features = self.image_forward(images) 59 | else: 60 | image_features = self.image_forward(images) 61 | 62 | return image_features 63 | 64 | @property 65 | def dummy_feature(self): 66 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 67 | 68 | @property 69 | def dtype(self): 70 | return self.vision_tower.dtype 71 | 72 | @property 73 | def device(self): 74 | return self.vision_tower.device 75 | 76 | @property 77 | def config(self): 78 | if self.is_loaded: 79 | return self.vision_tower.config 80 | else: 81 | return self.cfg_only 82 | 83 | @property 84 | def hidden_size(self): 85 | return self.config.hidden_size 86 | 87 | @property 88 | def num_patches(self): 89 | return (self.config.image_size // self.config.patch_size) ** 2 -------------------------------------------------------------------------------- /mgm/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | class IdentityMap(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def forward(self, x, *args, **kwargs): 10 | return x 11 | 12 | @property 13 | def config(self): 14 | return {"mm_projector_type": 'identity'} 15 | 16 | 17 | class SimpleResBlock(nn.Module): 18 | def __init__(self, channels): 19 | super().__init__() 20 | self.pre_norm = nn.LayerNorm(channels) 21 | 22 | self.proj = nn.Sequential( 23 | nn.Linear(channels, channels), 24 | nn.GELU(), 25 | nn.Linear(channels, channels) 26 | ) 27 | def forward(self, x): 28 | x = self.pre_norm(x) 29 | return x + self.proj(x) 30 | 31 | 32 | def 
build_vision_projector(config, delay_load=False, **kwargs): 33 | projector_type = getattr(config, 'mm_projector_type', 'linear') 34 | 35 | if projector_type == 'linear': 36 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 37 | 38 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 39 | if mlp_gelu_match: 40 | mlp_depth = int(mlp_gelu_match.group(1)) 41 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 42 | for _ in range(1, mlp_depth): 43 | modules.append(nn.GELU()) 44 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 45 | return nn.Sequential(*modules) 46 | 47 | if projector_type == 'identity': 48 | return IdentityMap() 49 | 50 | raise ValueError(f'Unknown projector type: {projector_type}') -------------------------------------------------------------------------------- /mgm/model/processor/video_processor.py: -------------------------------------------------------------------------------- 1 | from transformers import CLIPImageProcessor 2 | from transformers.image_processing_utils import BatchFeature, get_size_dict 3 | from transformers.image_transforms import get_resize_output_image_size 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | import numpy as np 9 | 10 | 11 | class VideoFramesProcessor(CLIPImageProcessor): 12 | 13 | def __init__(self, **kwargs): 14 | super().__init__(**kwargs) 15 | 16 | def preprocess(self, images, **kwargs): 17 | if not isinstance(images, np.ndarray): 18 | return super().preprocess(images=images, **kwargs) 19 | 20 | do_resize = kwargs.get('do_resize', self.do_resize) 21 | size = kwargs.get('size', self.size) 22 | size = get_size_dict(size, param_name="size", default_to_square=False) 23 | do_center_crop = kwargs.get('do_center_crop', self.do_center_crop) 24 | crop_size = kwargs.get('crop_size', self.crop_size) 25 | crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) 26 | do_rescale = kwargs.get('do_rescale', self.do_rescale) 27 | rescale_factor = kwargs.get('rescale_factor', self.rescale_factor) 28 | do_normalize = kwargs.get('do_normalize', self.do_normalize) 29 | image_mean = kwargs.get('image_mean', self.image_mean) 30 | image_std = kwargs.get('image_std', self.image_std) 31 | return_tensors = kwargs.get('return_tensors', None) 32 | 33 | def resize(images, output_size): 34 | images = images.permute((0, 3, 1, 2)) 35 | images = F.interpolate(images, size=output_size, mode='bicubic') 36 | images = images.permute((0, 2, 3, 1)) 37 | return images 38 | 39 | def center_crop(images, crop_size): 40 | crop_width, crop_height = crop_size["width"], crop_size["height"] 41 | img_width, img_height = images.shape[1:3] 42 | x = (img_width - crop_width) // 2 43 | y = (img_height - crop_height) // 2 44 | images = images[:, x:x+crop_width, y:y+crop_height] 45 | return images 46 | 47 | def rescale(images, rescale_factor): 48 | images = images * rescale_factor 49 | return images 50 | 51 | def normalize(images, mean, std): 52 | mean = torch.tensor(mean) 53 | std = torch.tensor(std) 54 | images = (images - mean) / std 55 | return images 56 | 57 | images = torch.from_numpy(images).float() 58 | 59 | if do_resize: 60 | output_size = get_resize_output_image_size(images[0], size=size["shortest_edge"], default_to_square=False) 61 | images = resize(images, output_size) 62 | 63 | if do_center_crop: 64 | images = center_crop(images, crop_size) 65 | 66 | if do_rescale: 67 | images = rescale(images, rescale_factor) 68 | 69 | if do_normalize: 70 | images = normalize(images, image_mean, 
image_std) 71 | 72 | images = images.permute((0, 3, 1, 2)) 73 | data = {"pixel_values": images} 74 | return BatchFeature(data=data, tensor_type=return_tensors) 75 | -------------------------------------------------------------------------------- /mgm/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/serve/__init__.py -------------------------------------------------------------------------------- /mgm/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /mgm/serve/examples/monday.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/serve/examples/monday.jpg -------------------------------------------------------------------------------- /mgm/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /mgm/serve/examples/woolen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/serve/examples/woolen.png -------------------------------------------------------------------------------- /mgm/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
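(Note: the usage line below still shows a fastchat module path; in this repository the script itself lives at mgm.serve.register_worker.)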
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /mgm/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from mgm.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /mgm/train/llama_xformers_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | """ 2 | Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments 3 | """ 4 | 5 | import logging 6 | import math 7 | from typing import Optional, Tuple 8 | 9 | import torch 10 | import transformers.models.llama.modeling_llama 11 | from torch import nn 12 | 13 | try: 14 | 
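# xformers is optional at import time: a missing install only logs an error here, but the xformers.ops calls further down will then fail at runtime.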
import xformers.ops 15 | except ImportError: 16 | logging.error("xformers not found! Please install it before trying to use it.") 17 | 18 | 19 | def replace_llama_attn_with_xformers_attn(): 20 | transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward 21 | 22 | 23 | def xformers_forward( 24 | self, 25 | hidden_states: torch.Tensor, 26 | attention_mask: Optional[torch.Tensor] = None, 27 | position_ids: Optional[torch.LongTensor] = None, 28 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 29 | output_attentions: bool = False, 30 | use_cache: bool = False, 31 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 32 | # pylint: disable=duplicate-code 33 | bsz, q_len, _ = hidden_states.size() 34 | 35 | query_states = ( 36 | self.q_proj(hidden_states) 37 | .view(bsz, q_len, self.num_heads, self.head_dim) 38 | .transpose(1, 2) 39 | ) 40 | key_states = ( 41 | self.k_proj(hidden_states) 42 | .view(bsz, q_len, self.num_heads, self.head_dim) 43 | .transpose(1, 2) 44 | ) 45 | value_states = ( 46 | self.v_proj(hidden_states) 47 | .view(bsz, q_len, self.num_heads, self.head_dim) 48 | .transpose(1, 2) 49 | ) 50 | 51 | kv_seq_len = key_states.shape[-2] 52 | if past_key_value is not None: 53 | kv_seq_len += past_key_value[0].shape[-2] 54 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 55 | ( 56 | query_states, 57 | key_states, 58 | ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb( 59 | query_states, key_states, cos, sin, position_ids 60 | ) 61 | # [bsz, nh, t, hd] 62 | 63 | if past_key_value is not None: 64 | # reuse k, v, self_attention 65 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 66 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 67 | 68 | past_key_value = (key_states, value_states) if use_cache else None 69 | 70 | # We only apply xformers optimizations if we don't need to output the whole attention matrix 71 | if not output_attentions: 72 | query_states = query_states.transpose(1, 2) 73 | key_states = key_states.transpose(1, 2) 74 | value_states = value_states.transpose(1, 2) 75 | 76 | # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros. 77 | # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros. 
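# An all-zero (or absent) mask needs no bias; otherwise LowerTriangularMask() reproduces the causal mask without materializing the full attention matrix.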
78 | if attention_mask is None or attention_mask[0, 0, 0, 1] == 0: 79 | # input and output should be of form (bsz, q_len, num_heads, head_dim) 80 | attn_output = xformers.ops.memory_efficient_attention( 81 | query_states, key_states, value_states, attn_bias=None 82 | ) 83 | else: 84 | # input and output should be of form (bsz, q_len, num_heads, head_dim) 85 | attn_output = xformers.ops.memory_efficient_attention( 86 | query_states, 87 | key_states, 88 | value_states, 89 | attn_bias=xformers.ops.LowerTriangularMask(), 90 | ) 91 | attn_weights = None 92 | else: 93 | attn_weights = torch.matmul( 94 | query_states, key_states.transpose(2, 3) 95 | ) / math.sqrt(self.head_dim) 96 | 97 | if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): 98 | raise ValueError( 99 | f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" 100 | f" {attn_weights.size()}" 101 | ) 102 | 103 | if attention_mask is not None: 104 | if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): 105 | raise ValueError( 106 | f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" 107 | ) 108 | attn_weights = attn_weights + attention_mask 109 | attn_weights = torch.max( 110 | attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min) 111 | ) 112 | 113 | # upcast attention to fp32 114 | attn_weights = nn.functional.softmax( 115 | attn_weights, dim=-1, dtype=torch.float32 116 | ).to(query_states.dtype) 117 | attn_output = torch.matmul(attn_weights, value_states) 118 | 119 | if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): 120 | raise ValueError( 121 | f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" 122 | f" {attn_output.size()}" 123 | ) 124 | 125 | attn_output = attn_output.transpose(1, 2) 126 | 127 | attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) 128 | attn_output = self.o_proj(attn_output) 129 | return attn_output, attn_weights, past_key_value 130 | -------------------------------------------------------------------------------- /mgm/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from mgm.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") -------------------------------------------------------------------------------- /mgm/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from mgm.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from mgm.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /mgm/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from mgm.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
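# The two messages above are canned responses surfaced to users when generation fails or an input is flagged by moderation.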
13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True, encoding='UTF-8') 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 
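Requires the OPENAI_API_KEY environment variable; returns False when the request fails or the response contains no moderation result.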
105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "mgm" 7 | version = "1.0.0" 8 | description = "Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.0.1", "torchvision==0.15.2", 17 | "transformers==4.36.2", "tokenizers==0.15.0", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft==0.4.0", "bitsandbytes==0.41.0", 19 | "pydantic<2,>=1", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==3.35.2", "gradio_client==0.2.9", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.9.16", 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.11.1", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://github.com/dvlab-research/MGM" 31 | "Bug Tracker" = "https://github.com/dvlab-research/MGM/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*", "data*", "model_zoo*", "work_dirs*", "project*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*", "data*", "model_zoo*", "work_dirs*", "project*"] 38 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | 
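# All four arguments are required: the MMBench annotation TSV, the directory of model answers, the upload directory, and the experiment (checkpoint) name.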
parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} 
accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | eval_single(args.result_file, eval_only_type='video') 63 | 64 | with open(args.result_upload_file, 'w') as fp: 65 | for question in data['questions']: 66 | qid = question['question_id'] 67 | if qid in results: 68 | result = results[qid] 69 | else: 70 | result = results[int(qid)] 71 | fp.write(json.dumps({ 72 | 'question_id': qid, 73 | 'prediction': result['text'] 74 | }) + '\n') 75 | -------------------------------------------------------------------------------- /scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 
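# Without an index file, assume a single consolidated pytorch_model.bin and scan its keys directly for the projector weights.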
35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /scripts/gemma/eval/math_vista.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-2B" 9 | OPENAIKEY="" 10 | OPENAIBASE="" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_math_vista \ 14 | --model-path work_dirs/$CKPT \ 15 | --question-file data/MGM-Eval/MathVista/testmini.json \ 16 | --image-folder data/MGM-Eval/MathVista \ 17 | --answers-file data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode gemma & 22 | done 23 | 24 | wait 25 | 26 | output_file=./data/MGM-Eval/MathVista/answers/$CKPT/merge.jsonl 27 | score_file=./data/MGM-Eval/MathVista/answers/$CKPT/score.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MathVista/extract_answer.py \ 38 | --output_file $output_file \ 39 | --api_key $OPENAIKEY \ 40 | --api_base $OPENAIBASE 41 | 42 | python mgm/eval/MathVista/calculate_score.py \ 43 | --output_file $output_file \ 44 | --score_file $score_file \ 45 | --gt_file data/MGM-Eval/MathVista/testmini.json 46 | -------------------------------------------------------------------------------- /scripts/gemma/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-2B" 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_mmbench \ 7 | --model-path ./work_dirs/$CKPT \ 8 | --question-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./data/MGM-Eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode gemma 13 | 14 | mkdir -p ./data/MGM-Eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./data/MGM-Eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./data/MGM-Eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment $CKPT 21 | -------------------------------------------------------------------------------- /scripts/gemma/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT="MGM/MGM-2B" 3 | 4 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_loader \ 5 | --model-path work_dirs/$CKPT \ 6 | --question-file data/MGM-Eval/MME/llava_mme.jsonl \ 7 | --image-folder data/MGM-Eval/MME/MME_Benchmark_release_version \ 8 | 
--answers-file data/MGM-Eval/MME/answers/$CKPT.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode gemma 11 | 12 | cd data/MGM-Eval/MME 13 | 14 | python convert_answer_to_mme.py --experiment $CKPT 15 | 16 | cd eval_tool 17 | 18 | python calculation.py --results_dir answers/$CKPT 19 | -------------------------------------------------------------------------------- /scripts/gemma/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-2B" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "validation" \ 21 | --conv-mode gemma & 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python mgm/eval/MMMU/eval/eval.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/val.json -------------------------------------------------------------------------------- /scripts/gemma/eval/mmmu_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-2B" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "test" \ 21 | --conv-mode gemma & 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
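# Each GPU wrote its own ${CHUNKS}_${IDX}.jsonl chunk; the merged file feeds convert_to_test.py below.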
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | 37 | python mgm/eval/MMMU/eval/convert_to_test.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/test.json -------------------------------------------------------------------------------- /scripts/gemma/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-2B" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa \ 12 | --model-path work_dirs/$CKPT \ 13 | --question-file data/MGM-Eval/mm-vet/llava-mm-vet.jsonl \ 14 | --image-folder data/MGM-Eval/mm-vet/images \ 15 | --answers-file data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode gemma & 20 | done 21 | 22 | wait 23 | 24 | output_file=data/MGM-Eval/mm-vet/answers/$CKPT/merge.jsonl 25 | # Clear out the output file if it exists. 26 | > "$output_file" 27 | 28 | # Loop through the indices and concatenate each file. 29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | mkdir -p data/MGM-Eval/mm-vet/results/$CKPT 34 | 35 | python scripts/convert_mmvet_for_eval.py \ 36 | --src $output_file \ 37 | --dst data/MGM-Eval/mm-vet/results/$CKPT/$CKPT.json 38 | 39 | -------------------------------------------------------------------------------- /scripts/gemma/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-2B" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa_loader \ 13 | --model-path ./work_dirs/$CKPT \ 14 | --question-file ./data/MGM-Eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 15 | --image-folder ./data/MGM-Eval/textvqa/train_images \ 16 | --answers-file ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode gemma & 21 | done 22 | 23 | wait 24 | 25 | output_file=./work_dirs/textvqa/answers/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 
31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python -m mgm.eval.eval_textvqa \ 36 | --annotation-file ./data/MGM-Eval/textvqa/TextVQA_0.5.1_val.json \ 37 | --result-file $output_file 38 | -------------------------------------------------------------------------------- /scripts/gemma/train/stage_1_2_full_gemma_v2b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-2B-Pretrain 3 | FINETUNE_NAME=MGM-2B 4 | AUX_SIZE=768 5 | 6 | deepspeed mgm/train/train_mem.py \ 7 | --deepspeed ./scripts/zero2_offload.json \ 8 | --model_name_or_path model_zoo/LLM/gemma/gemma-2b-it \ 9 | --version gemma \ 10 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 11 | --image_folder ./data/MGM-Pretrain \ 12 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 13 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 14 | --mm_projector_type mlp2x_gelu \ 15 | --tune_mm_mlp_adapter True \ 16 | --mm_vision_select_layer -2 \ 17 | --mm_use_im_start_end False \ 18 | --mm_use_im_patch_token False \ 19 | --image_size_aux $AUX_SIZE \ 20 | --bf16 True \ 21 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 22 | --num_train_epochs 1 \ 23 | --per_device_train_batch_size 8 \ 24 | --per_device_eval_batch_size 4 \ 25 | --gradient_accumulation_steps 4 \ 26 | --evaluation_strategy "no" \ 27 | --save_strategy "steps" \ 28 | --save_steps 24000 \ 29 | --save_total_limit 1 \ 30 | --learning_rate 1e-3 \ 31 | --weight_decay 0. \ 32 | --warmup_ratio 0.03 \ 33 | --lr_scheduler_type "cosine" \ 34 | --logging_steps 1 \ 35 | --tf32 True \ 36 | --model_max_length 2048 \ 37 | --gradient_checkpointing True \ 38 | --dataloader_num_workers 4 \ 39 | --lazy_preprocess True \ 40 | --report_to wandb 41 | 42 | 43 | deepspeed mgm/train/train_mem.py \ 44 | --deepspeed ./scripts/zero2.json \ 45 | --model_name_or_path model_zoo/LLM/gemma/gemma-2b-it \ 46 | --version gemma \ 47 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 48 | --image_folder ./data/MGM-Finetune \ 49 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 50 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 51 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 52 | --mm_projector_type mlp2x_gelu \ 53 | --mm_vision_select_layer -2 \ 54 | --mm_use_im_start_end False \ 55 | --mm_use_im_patch_token False \ 56 | --image_aspect_ratio pad \ 57 | --group_by_modality_length True \ 58 | --image_size_aux $AUX_SIZE \ 59 | --bf16 True \ 60 | --output_dir ./work_dirs/$FINETUNE_NAME \ 61 | --num_train_epochs 1 \ 62 | --per_device_train_batch_size 8 \ 63 | --per_device_eval_batch_size 4 \ 64 | --gradient_accumulation_steps 2 \ 65 | --evaluation_strategy "no" \ 66 | --save_strategy "steps" \ 67 | --save_steps 1000 \ 68 | --save_total_limit 1 \ 69 | --learning_rate 2e-5 \ 70 | --weight_decay 0. 
\ 71 | --warmup_ratio 0.03 \ 72 | --lr_scheduler_type "cosine" \ 73 | --logging_steps 1 \ 74 | --tf32 True \ 75 | --model_max_length 2048 \ 76 | --gradient_checkpointing True \ 77 | --dataloader_num_workers 4 \ 78 | --lazy_preprocess True \ 79 | --report_to wandb 80 | -------------------------------------------------------------------------------- /scripts/gemma/train/stage_2_full_gemma_v2b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-2B-Pretrain 3 | FINETUNE_NAME=MGM-2B-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | 8 | deepspeed --hostfile hostfile_4 \ 9 | mgm/train/train_mem.py \ 10 | --deepspeed ./scripts/zero2.json \ 11 | --model_name_or_path model_zoo/LLM/gemma/gemma-2b-it \ 12 | --version gemma \ 13 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 14 | --image_folder ./data/MGM-Finetune \ 15 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 16 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 17 | --image_grid $IMAGE_GRID \ 18 | --image_global $IMAGE_GLOBAL \ 19 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 20 | --mm_projector_type mlp2x_gelu \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --image_aspect_ratio pad \ 25 | --group_by_modality_length True \ 26 | --image_size_aux $AUX_SIZE \ 27 | --bf16 True \ 28 | --output_dir ./work_dirs/$FINETUNE_NAME \ 29 | --num_train_epochs 1 \ 30 | --per_device_train_batch_size 4 \ 31 | --per_device_eval_batch_size 4 \ 32 | --gradient_accumulation_steps 1 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 20000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 2e-5 \ 38 | --weight_decay 0. \ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 True \ 43 | --model_max_length 4096 \ 44 | --gradient_checkpointing True \ 45 | --dataloader_num_workers 4 \ 46 | --lazy_preprocess True \ 47 | --report_to wandb 48 | -------------------------------------------------------------------------------- /scripts/llama/eval/math_vista.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-7B-HD" 9 | OPENAIKEY="" 10 | OPENAIBASE="" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_math_vista \ 14 | --model-path work_dirs/$CKPT \ 15 | --question-file data/MGM-Eval/MathVista/testmini.json \ 16 | --image-folder data/MGM-Eval/MathVista \ 17 | --answers-file data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./data/MGM-Eval/MathVista/answers/$CKPT/merge.jsonl 27 | score_file=./data/MGM-Eval/MathVista/answers/$CKPT/score.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MathVista/extract_answer.py \ 38 | --output_file $output_file \ 39 | --api_key $OPENAIKEY \ 40 | --api_base $OPENAIBASE 41 | 42 | python mgm/eval/MathVista/calculate_score.py \ 43 | --output_file $output_file \ 44 | --score_file $score_file \ 45 | --gt_file data/MGM-Eval/MathVista/testmini.json 46 | -------------------------------------------------------------------------------- /scripts/llama/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-7B-HD" 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | CUDA_VISIBLE_DEVICES=1 python -m mgm.eval.model_vqa_mmbench \ 7 | --model-path ./work_dirs/$CKPT \ 8 | --question-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./data/MGM-Eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p ./data/MGM-Eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./data/MGM-Eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./data/MGM-Eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment $CKPT 21 | -------------------------------------------------------------------------------- /scripts/llama/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-7B-HD" 4 | 5 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_loader \ 6 | --model-path work_dirs/$CKPT \ 7 | --question-file data/MGM-Eval/MME/llava_mme.jsonl \ 8 | --image-folder data/MGM-Eval/MME/MME_Benchmark_release_version \ 9 | --answers-file data/MGM-Eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | cd data/MGM-Eval/MME 14 | 15 | python convert_answer_to_mme.py --experiment $CKPT 16 | 17 | cd eval_tool 18 | 19 | python calculation.py --results_dir answers/$CKPT 20 | -------------------------------------------------------------------------------- /scripts/llama/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-7B-HD" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "validation" \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python mgm/eval/MMMU/eval/eval.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/val.json -------------------------------------------------------------------------------- /scripts/llama/eval/mmmu_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-7B-HD" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "test" \ 21 | --conv-mode vicuna_v1 & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | 37 | python mgm/eval/MMMU/eval/convert_to_test.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/test.json -------------------------------------------------------------------------------- /scripts/llama/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-7B-HD" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa \ 12 | --model-path work_dirs/$CKPT \ 13 | --question-file data/MGM-Eval/mm-vet/llava-mm-vet.jsonl \ 14 | --image-folder data/MGM-Eval/mm-vet/images \ 15 | --answers-file data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode vicuna_v1 & 20 | done 21 | 22 | wait 23 | 24 | output_file=data/MGM-Eval/mm-vet/answers/$CKPT/merge.jsonl 25 | # Clear out the output file if it exists. 26 | > "$output_file" 27 | 28 | # Loop through the indices and concatenate each file. 
29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | mkdir -p data/MGM-Eval/mm-vet/results/$CKPT 34 | 35 | python scripts/convert_mmvet_for_eval.py \ 36 | --src $output_file \ 37 | --dst data/MGM-Eval/mm-vet/results/$CKPT/$CKPT.json 38 | 39 | -------------------------------------------------------------------------------- /scripts/llama/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-7B-HD" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa_loader \ 13 | --model-path ./work_dirs/$CKPT \ 14 | --question-file ./data/MGM-Eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 15 | --image-folder ./data/MGM-Eval/textvqa/train_images \ 16 | --answers-file ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./work_dirs/textvqa/answers/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python -m mgm.eval.eval_textvqa \ 36 | --annotation-file ./data/MGM-Eval/textvqa/TextVQA_0.5.1_val.json \ 37 | --result-file $output_file 38 | -------------------------------------------------------------------------------- /scripts/llama/train/stage_1_2_full_v13b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-13B-Pretrain 3 | FINETUNE_NAME=MGM-13B 4 | AUX_SIZE=768 5 | 6 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 7 | 8 | deepspeed --hostfile hostfile_4 \ 9 | mgm/train/train_mem.py \ 10 | --deepspeed ./scripts/zero2.json \ 11 | --model_name_or_path model_zoo/LLM/vicuna/13B-V1.5 \ 12 | --version plain \ 13 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 14 | --image_folder ./data/MGM-Pretrain \ 15 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 16 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 17 | --mm_projector_type mlp2x_gelu \ 18 | --tune_mm_mlp_adapter True \ 19 | --mm_vision_select_layer -2 \ 20 | --mm_use_im_start_end False \ 21 | --mm_use_im_patch_token False \ 22 | --image_size_aux $AUX_SIZE \ 23 | --bf16 True \ 24 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 8 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 1 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | 45 | 46 | deepspeed --hostfile hostfile_4 \ 47 | mgm/train/train_mem.py \ 48 | --deepspeed ./scripts/zero2.json \ 49 | --model_name_or_path model_zoo/LLM/vicuna/13B-V1.5 \ 50 | --version v1 \ 51 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 52 | --image_folder ./data/MGM-Finetune \ 53 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 54 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 55 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 56 | --mm_projector_type mlp2x_gelu \ 57 | --mm_vision_select_layer -2 \ 58 | --mm_use_im_start_end False \ 59 | --mm_use_im_patch_token False \ 60 | --image_aspect_ratio pad \ 61 | --group_by_modality_length True \ 62 | --image_size_aux $AUX_SIZE \ 63 | --bf16 True \ 64 | --output_dir ./work_dirs/$FINETUNE_NAME \ 65 | --num_train_epochs 1 \ 66 | --per_device_train_batch_size 4 \ 67 | --per_device_eval_batch_size 4 \ 68 | --gradient_accumulation_steps 1 \ 69 | --evaluation_strategy "no" \ 70 | --save_strategy "steps" \ 71 | --save_steps 1000 \ 72 | --save_total_limit 1 \ 73 | --learning_rate 2e-5 \ 74 | --weight_decay 0. \ 75 | --warmup_ratio 0.03 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --tf32 True \ 79 | --model_max_length 2048 \ 80 | --gradient_checkpointing True \ 81 | --dataloader_num_workers 4 \ 82 | --lazy_preprocess True \ 83 | --report_to wandb 84 | -------------------------------------------------------------------------------- /scripts/llama/train/stage_1_2_full_v7b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-7B-Pretrain 3 | FINETUNE_NAME=MGM-7B 4 | AUX_SIZE=768 5 | 6 | # delete --hostfile hostfile and change --per_device_train_batch_size if trained on single machine 7 | 8 | deepspeed --hostfile hostfile \ 9 | mgm/train/train_mem.py \ 10 | --deepspeed ./scripts/zero2_offload.json \ 11 | --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \ 12 | --version plain \ 13 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 14 | --image_folder ./data/MGM-Pretrain \ 15 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 16 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 17 | --mm_projector_type mlp2x_gelu \ 18 | --tune_mm_mlp_adapter True \ 19 | --mm_vision_select_layer -2 \ 20 | --mm_use_im_start_end False \ 21 | --mm_use_im_patch_token False \ 22 | --image_size_aux $AUX_SIZE \ 23 | --bf16 True \ 24 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 16 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 1 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | 45 | 46 | deepspeed --hostfile hostfile \ 47 | mgm/train/train_mem.py \ 48 | --deepspeed ./scripts/zero2_offload.json \ 49 | --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \ 50 | --version v1 \ 51 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 52 | --image_folder ./data/MGM-Finetune \ 53 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 54 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 55 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 56 | --mm_projector_type mlp2x_gelu \ 57 | --mm_vision_select_layer -2 \ 58 | --mm_use_im_start_end False \ 59 | --mm_use_im_patch_token False \ 60 | --image_aspect_ratio pad \ 61 | --group_by_modality_length True \ 62 | --image_size_aux $AUX_SIZE \ 63 | --bf16 True \ 64 | --output_dir ./work_dirs/$FINETUNE_NAME \ 65 | --num_train_epochs 1 \ 66 | --per_device_train_batch_size 8 \ 67 | --per_device_eval_batch_size 4 \ 68 | --gradient_accumulation_steps 1 \ 69 | --evaluation_strategy "no" \ 70 | --save_strategy "steps" \ 71 | --save_steps 1000 \ 72 | --save_total_limit 1 \ 73 | --learning_rate 2e-5 \ 74 | --weight_decay 0. \ 75 | --warmup_ratio 0.03 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --tf32 True \ 79 | --model_max_length 2048 \ 80 | --gradient_checkpointing True \ 81 | --dataloader_num_workers 4 \ 82 | --lazy_preprocess True \ 83 | --report_to wandb 84 | -------------------------------------------------------------------------------- /scripts/llama/train/stage_1_2_full_v7b_336_hr_768_nodp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-7B-Pretrain 3 | FINETUNE_NAME=MGM-7B 4 | AUX_SIZE=768 5 | DROP_PATH=False 6 | # delete --hostfile hostfile and change --per_device_train_batch_size if trained on single machine 7 | 8 | deepspeed --hostfile hostfile \ 9 | mgm/train/train_mem.py \ 10 | --deepspeed ./scripts/zero2_offload.json \ 11 | --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \ 12 | --version plain \ 13 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 14 | --image_folder ./data/MGM-Pretrain \ 15 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 16 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 17 | --mm_projector_type mlp2x_gelu \ 18 | --tune_mm_mlp_adapter True \ 19 | --mm_vision_select_layer -2 \ 20 | --mm_use_im_start_end False \ 21 | --mm_use_im_patch_token False \ 22 | --image_size_aux $AUX_SIZE \ 23 | --drop_path $DROP_PATH \ 24 | --bf16 True \ 25 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 16 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | 46 | 47 | deepspeed --hostfile hostfile \ 48 | mgm/train/train_mem.py \ 49 | --deepspeed ./scripts/zero2_offload.json \ 50 | --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \ 51 | --version v1 \ 52 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 53 | --image_folder ./data/MGM-Finetune \ 54 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 55 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 56 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 57 | --mm_projector_type mlp2x_gelu \ 58 | --mm_vision_select_layer -2 \ 59 | --mm_use_im_start_end False \ 60 | --mm_use_im_patch_token False \ 61 | --image_aspect_ratio pad \ 62 | --group_by_modality_length True \ 63 | --image_size_aux $AUX_SIZE \ 64 | --drop_path $DROP_PATH \ 65 | --bf16 True \ 66 | --output_dir ./work_dirs/$FINETUNE_NAME \ 67 | --num_train_epochs 1 \ 68 | --per_device_train_batch_size 8 \ 69 | --per_device_eval_batch_size 4 \ 70 | --gradient_accumulation_steps 1 \ 71 | --evaluation_strategy "no" \ 72 | --save_strategy "steps" \ 73 | --save_steps 1000 \ 74 | --save_total_limit 1 \ 75 | --learning_rate 2e-5 \ 76 | --weight_decay 0. \ 77 | --warmup_ratio 0.03 \ 78 | --lr_scheduler_type "cosine" \ 79 | --logging_steps 1 \ 80 | --tf32 True \ 81 | --model_max_length 2048 \ 82 | --gradient_checkpointing True \ 83 | --dataloader_num_workers 4 \ 84 | --lazy_preprocess True \ 85 | --report_to wandb 86 | -------------------------------------------------------------------------------- /scripts/llama/train/stage_2_full_v13b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-13B-Pretrain 3 | FINETUNE_NAME=MGM-13B-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | 8 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 9 | 10 | deepspeed --hostfile hostfile_4 \ 11 | mgm/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path model_zoo/LLM/vicuna/13B-V1.5 \ 14 | --version v1 \ 15 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 16 | --image_folder ./data/MGM-Finetune \ 17 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 18 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 19 | --image_grid $IMAGE_GRID \ 20 | --image_global $IMAGE_GLOBAL \ 21 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --image_size_aux $AUX_SIZE \ 29 | --bf16 True \ 30 | --output_dir ./work_dirs/$FINETUNE_NAME \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 4 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 1 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 1000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. 
\ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 4096 \ 46 | --gradient_checkpointing True \ 47 | --dataloader_num_workers 4 \ 48 | --lazy_preprocess True \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/llama/train/stage_2_full_v7b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-7B-Pretrain 3 | FINETUNE_NAME=MGM-7B-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | 8 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 9 | 10 | deepspeed --hostfile hostfile_4 \ 11 | mgm/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \ 14 | --version v1 \ 15 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 16 | --image_folder ./data/MGM-Finetune \ 17 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 18 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 19 | --image_grid $IMAGE_GRID \ 20 | --image_global $IMAGE_GLOBAL \ 21 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --image_size_aux $AUX_SIZE \ 29 | --bf16 True \ 30 | --output_dir ./work_dirs/$FINETUNE_NAME \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 4 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 1 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 1000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. \ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 4096 \ 46 | --gradient_checkpointing True \ 47 | --dataloader_num_workers 4 \ 48 | --lazy_preprocess True \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/llama3/eval/math_vista.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM-8B-LLaMA-3-HD" 9 | OPENAIKEY="" 10 | OPENAIBASE="" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_math_vista \ 14 | --model-path work_dirs/$CKPT \ 15 | --question-file data/MGM-Eval/MathVista/testmini.json \ 16 | --image-folder data/MGM-Eval/MathVista \ 17 | --answers-file data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode llama_3 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./data/MGM-Eval/MathVista/answers/$CKPT/merge.jsonl 27 | score_file=./data/MGM-Eval/MathVista/answers/$CKPT/score.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MathVista/extract_answer.py \ 38 | --output_file $output_file \ 39 | --api_key $OPENAIKEY \ 40 | --api_base $OPENAIBASE 41 | 42 | python mgm/eval/MathVista/calculate_score.py \ 43 | --output_file $output_file \ 44 | --score_file $score_file \ 45 | --gt_file data/MGM-Eval/MathVista/testmini.json 46 | -------------------------------------------------------------------------------- /scripts/llama3/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM-8B-LLaMA-3-HD" 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | CUDA_VISIBLE_DEVICES=1 python -m mgm.eval.model_vqa_mmbench \ 7 | --model-path ./work_dirs/$CKPT \ 8 | --question-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./data/MGM-Eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode llama_3 13 | 14 | mkdir -p ./data/MGM-Eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./data/MGM-Eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./data/MGM-Eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment $CKPT 21 | -------------------------------------------------------------------------------- /scripts/llama3/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM-8B-LLaMA-3-HD" 4 | 5 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_loader \ 6 | --model-path work_dirs/$CKPT \ 7 | --question-file data/MGM-Eval/MME/llava_mme.jsonl \ 8 | --image-folder data/MGM-Eval/MME/MME_Benchmark_release_version \ 9 | --answers-file data/MGM-Eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode llama_3 12 | 13 | cd data/MGM-Eval/MME 14 | 15 | python convert_answer_to_mme.py --experiment $CKPT 16 | 17 | cd eval_tool 18 | 19 | python calculation.py --results_dir answers/$CKPT 20 | -------------------------------------------------------------------------------- /scripts/llama3/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM-8B-LLaMA-3-HD" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "validation" \ 21 | --conv-mode llama_3 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python mgm/eval/MMMU/eval/eval.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/val.json -------------------------------------------------------------------------------- /scripts/llama3/eval/mmmu_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM-8B-LLaMA-3-HD" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "test" \ 21 | --conv-mode llama_3 & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | 37 | python mgm/eval/MMMU/eval/convert_to_test.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/test.json -------------------------------------------------------------------------------- /scripts/llama3/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM-8B-LLaMA-3-HD" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa \ 12 | --model-path work_dirs/$CKPT \ 13 | --question-file data/MGM-Eval/mm-vet/llava-mm-vet.jsonl \ 14 | --image-folder data/MGM-Eval/mm-vet/images \ 15 | --answers-file data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode llama_3 & 20 | done 21 | 22 | wait 23 | 24 | output_file=data/MGM-Eval/mm-vet/answers/$CKPT/merge.jsonl 25 | # Clear out the output file if it exists. 26 | > "$output_file" 27 | 28 | # Loop through the indices and concatenate each file. 
29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | mkdir -p data/MGM-Eval/mm-vet/results/$CKPT 34 | 35 | python scripts/convert_mmvet_for_eval.py \ 36 | --src $output_file \ 37 | --dst data/MGM-Eval/mm-vet/results/$CKPT/$CKPT.json 38 | 39 | -------------------------------------------------------------------------------- /scripts/llama3/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM-8B-LLaMA-3-HD" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa_loader \ 13 | --model-path ./work_dirs/$CKPT \ 14 | --question-file ./data/MGM-Eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 15 | --image-folder ./data/MGM-Eval/textvqa/train_images \ 16 | --answers-file ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode llama_3 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./work_dirs/textvqa/answers/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python -m mgm.eval.eval_textvqa \ 36 | --annotation-file ./data/MGM-Eval/textvqa/TextVQA_0.5.1_val.json \ 37 | --result-file $output_file 38 | -------------------------------------------------------------------------------- /scripts/llama3/train/stage_1_2_full_v8b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-8B-LLaMA-3-Pretrain 3 | FINETUNE_NAME=MGM-8B-LLaMA-3 4 | AUX_SIZE=768 5 | 6 | # delete --hostfile hostfile and change --per_device_train_batch_size if trained on single machine 7 | 8 | deepspeed --hostfile ../hostfile \ 9 | mgm/train/train_mem.py \ 10 | --deepspeed ./scripts/zero2_offload.json \ 11 | --model_name_or_path model_zoo/LLM/llama-3/Meta-Llama-3-8B-Instruct \ 12 | --version llama_3 \ 13 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 14 | --image_folder ./data/MGM-Pretrain \ 15 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 16 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 17 | --mm_projector_type mlp2x_gelu \ 18 | --tune_mm_mlp_adapter True \ 19 | --mm_vision_select_layer -2 \ 20 | --mm_use_im_start_end False \ 21 | --mm_use_im_patch_token False \ 22 | --image_size_aux $AUX_SIZE \ 23 | --bf16 True \ 24 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 16 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 1 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | 45 | 46 | deepspeed --hostfile ../hostfile \ 47 | mgm/train/train_mem.py \ 48 | --deepspeed ./scripts/zero2_offload.json \ 49 | --model_name_or_path model_zoo/LLM/llama-3/Meta-Llama-3-8B-Instruct \ 50 | --version llama_3 \ 51 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 52 | --image_folder ./data/MGM-Finetune \ 53 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 54 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 55 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 56 | --mm_projector_type mlp2x_gelu \ 57 | --mm_vision_select_layer -2 \ 58 | --mm_use_im_start_end False \ 59 | --mm_use_im_patch_token False \ 60 | --image_aspect_ratio pad \ 61 | --group_by_modality_length True \ 62 | --image_size_aux $AUX_SIZE \ 63 | --bf16 True \ 64 | --output_dir ./work_dirs/$FINETUNE_NAME \ 65 | --num_train_epochs 1 \ 66 | --per_device_train_batch_size 8 \ 67 | --per_device_eval_batch_size 4 \ 68 | --gradient_accumulation_steps 1 \ 69 | --evaluation_strategy "no" \ 70 | --save_strategy "steps" \ 71 | --save_steps 1000 \ 72 | --save_total_limit 1 \ 73 | --learning_rate 2e-5 \ 74 | --weight_decay 0. \ 75 | --warmup_ratio 0.03 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --tf32 True \ 79 | --model_max_length 2048 \ 80 | --gradient_checkpointing True \ 81 | --dataloader_num_workers 4 \ 82 | --lazy_preprocess True \ 83 | --report_to wandb 84 | -------------------------------------------------------------------------------- /scripts/llama3/train/stage_2_full_v8b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-8B-LLaMA-3-Pretrain 3 | FINETUNE_NAME=MGM-8B-LLaMA-3-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | 8 | # delete --hostfile hostfile and change --per_device_train_batch_size if trained on single machine 9 | 10 | deepspeed --hostfile ../hostfile \ 11 | mgm/train/train_mem.py \ 12 | --deepspeed ./scripts/zero3.json \ 13 | --model_name_or_path model_zoo/LLM/llama-3/Meta-Llama-3-8B-Instruct \ 14 | --version llama_3 \ 15 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 16 | --image_folder ./data/MGM-Finetune \ 17 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 18 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 19 | --image_grid $IMAGE_GRID \ 20 | --image_global $IMAGE_GLOBAL \ 21 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --image_size_aux $AUX_SIZE \ 29 | --bf16 True \ 30 | --output_dir ./work_dirs/$FINETUNE_NAME \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 4 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 2 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 1000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. 
\ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 4096 \ 46 | --gradient_checkpointing True \ 47 | --dataloader_num_workers 4 \ 48 | --lazy_preprocess True \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mgm.model.builder import load_pretrained_model 3 | from mgm.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/mixtral/eval/math_vista.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-8x7B" 9 | OPENAIKEY="" 10 | OPENAIBASE="" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_math_vista \ 14 | --model-path work_dirs/$CKPT \ 15 | --question-file data/MGM-Eval/MathVista/testmini.json \ 16 | --image-folder data/MGM-Eval/MathVista \ 17 | --answers-file data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode mistral_instruct & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./data/MGM-Eval/MathVista/answers/$CKPT/merge.jsonl 27 | score_file=./data/MGM-Eval/MathVista/answers/$CKPT/score.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MathVista/extract_answer.py \ 38 | --output_file $output_file \ 39 | --api_key $OPENAIKEY \ 40 | --api_base $OPENAIBASE 41 | 42 | python mgm/eval/MathVista/calculate_score.py \ 43 | --output_file $output_file \ 44 | --score_file $score_file \ 45 | --gt_file data/MGM-Eval/MathVista/testmini.json 46 | -------------------------------------------------------------------------------- /scripts/mixtral/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-8x7B" 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | python -m mgm.eval.model_vqa_mmbench \ 7 | --model-path ./work_dirs/$CKPT \ 8 | --question-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./data/MGM-Eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode mistral_instruct 13 | 14 | mkdir -p ./data/MGM-Eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./data/MGM-Eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./data/MGM-Eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment $CKPT 21 | -------------------------------------------------------------------------------- /scripts/mixtral/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT="MGM/MGM-8x7B" 3 | 4 | python -m mgm.eval.model_vqa_loader \ 5 | --model-path work_dirs/$CKPT \ 6 | --question-file data/MGM-Eval/MME/llava_mme.jsonl \ 7 | --image-folder data/MGM-Eval/MME/MME_Benchmark_release_version \ 8 | --answers-file data/MGM-Eval/MME/answers/$CKPT.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode mistral_instruct 11 | 12 | 13 | cd data/MGM-Eval/MME 14 | 15 | python convert_answer_to_mme.py --experiment $CKPT 16 | 17 | cd eval_tool 18 | 19 | python calculation.py --results_dir answers/$CKPT 20 | -------------------------------------------------------------------------------- /scripts/mixtral/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-8x7B" 4 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 5 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 6 | 7 | # Clear out the output file if it exists. 
8 | > "$output_file" 9 | 10 | python mgm/eval/MMMU/eval/run_llava.py \ 11 | --data_path ./data/MGM-Eval/MMMU \ 12 | --config_path $CONFIG \ 13 | --model_path ./work_dirs/$CKPT \ 14 | --answers-file $output_file \ 15 | --split "validation" \ 16 | --conv-mode mistral_instruct 17 | 18 | python mgm/eval/MMMU/eval/eval.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/val.json -------------------------------------------------------------------------------- /scripts/mixtral/eval/mmmu_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-8x7B" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "test" \ 21 | --load_8bit True \ 22 | --conv-mode mistral_instruct & 23 | done 24 | 25 | wait 26 | 27 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MMMU/eval/convert_to_test.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/test.json -------------------------------------------------------------------------------- /scripts/mixtral/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-8x7B" 9 | 10 | output_file=data/MGM-Eval/mm-vet/answers/$CKPT/merge.jsonl 11 | # Clear out the output file if it exists. 12 | > "$output_file" 13 | 14 | python -m mgm.eval.model_vqa \ 15 | --model-path work_dirs/$CKPT \ 16 | --question-file data/MGM-Eval/mm-vet/llava-mm-vet.jsonl \ 17 | --image-folder data/MGM-Eval/mm-vet/images \ 18 | --answers-file $output_file \ 19 | --temperature 0 \ 20 | --conv-mode mistral_instruct 21 | 22 | mkdir -p data/MGM-Eval/mm-vet/results/$CKPT 23 | 24 | python scripts/convert_mmvet_for_eval.py \ 25 | --src $output_file \ 26 | --dst data/MGM-Eval/mm-vet/results/$CKPT/$CKPT.json 27 | 28 | -------------------------------------------------------------------------------- /scripts/mixtral/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-8x7B" 4 | output_file=./work_dirs/textvqa/answers/$CKPT/merge.jsonl 5 | # Clear out the output file if it exists. 
6 | > "$output_file" 7 | 8 | python -m mgm.eval.model_vqa_loader \ 9 | --model-path ./work_dirs/$CKPT \ 10 | --question-file ./data/MGM-Eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 11 | --image-folder ./data/MGM-Eval/textvqa/train_images \ 12 | --answers-file $output_file \ 13 | --temperature 0 \ 14 | --conv-mode mistral_instruct 15 | 16 | python -m mgm.eval.eval_textvqa \ 17 | --annotation-file ./data/MGM-Eval/textvqa/TextVQA_0.5.1_val.json \ 18 | --result-file $output_file 19 | -------------------------------------------------------------------------------- /scripts/mixtral/train/stage_1_2_full_mixtral_8x7b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-8x7B-Pretrain 3 | FINETUNE_NAME=MGM-8x7B 4 | AUX_SIZE=768 5 | LR_MULTI="model.mm_projector:2,model.vlm_uni:2" 6 | 7 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 8 | 9 | deepspeed --hostfile hostfile_4 \ 10 | mgm/train/train_mem.py \ 11 | --deepspeed ./scripts/zero3.json \ 12 | --model_name_or_path model_zoo/LLM/mixtral/Mixtral-8x7B-Instruct-v0.1 \ 13 | --version plain \ 14 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 15 | --image_folder ./data/MGM-Pretrain \ 16 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 17 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --image_size_aux $AUX_SIZE \ 24 | --bf16 True \ 25 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-3 \ 35 | --weight_decay 0. \ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | 46 | 47 | deepspeed --hostfile hostfile_4 \ 48 | mgm/train/train_mem.py \ 49 | --deepspeed ./scripts/zero3.json \ 50 | --model_name_or_path model_zoo/LLM/mixtral/Mixtral-8x7B-Instruct-v0.1 \ 51 | --version mistral_instruct \ 52 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 53 | --image_folder ./data/MGM-Finetune \ 54 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 55 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 56 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 57 | --mm_projector_type mlp2x_gelu \ 58 | --mm_vision_select_layer -2 \ 59 | --mm_use_im_start_end False \ 60 | --mm_use_im_patch_token False \ 61 | --image_aspect_ratio pad \ 62 | --group_by_modality_length True \ 63 | --image_size_aux $AUX_SIZE \ 64 | --bf16 True \ 65 | --output_dir ./work_dirs/$FINETUNE_NAME \ 66 | --num_train_epochs 1 \ 67 | --per_device_train_batch_size 4 \ 68 | --per_device_eval_batch_size 4 \ 69 | --gradient_accumulation_steps 1 \ 70 | --evaluation_strategy "no" \ 71 | --save_strategy "steps" \ 72 | --save_steps 1000 \ 73 | --save_total_limit 1 \ 74 | --learning_rate 1e-5 \ 75 | --lr_multi $LR_MULTI \ 76 | --weight_decay 0. 
\ 77 | --warmup_ratio 0.03 \ 78 | --lr_scheduler_type "cosine" \ 79 | --logging_steps 1 \ 80 | --tf32 True \ 81 | --model_max_length 2048 \ 82 | --gradient_checkpointing True \ 83 | --dataloader_num_workers 4 \ 84 | --lazy_preprocess True \ 85 | --report_to wandb 86 | -------------------------------------------------------------------------------- /scripts/mixtral/train/stage_2_full_mixtral_8x7b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-8x7B-Pretrain 3 | FINETUNE_NAME=MGM-8x7B-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | LR_MULTI="model.mm_projector:2,model.vlm_uni:2" 8 | 9 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 10 | 11 | deepspeed --hostfile hostfile_4 \ 12 | mgm/train/train_mem.py \ 13 | --deepspeed ./scripts/zero3.json \ 14 | --model_name_or_path model_zoo/LLM/mixtral/Mixtral-8x7B-Instruct-v0.1 \ 15 | --version mistral_instruct \ 16 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 17 | --image_folder ./data/MGM-Finetune \ 18 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 19 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 20 | --image_grid $IMAGE_GRID \ 21 | --image_global $IMAGE_GLOBAL \ 22 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 23 | --mm_projector_type mlp2x_gelu \ 24 | --mm_vision_select_layer -2 \ 25 | --mm_use_im_start_end False \ 26 | --mm_use_im_patch_token False \ 27 | --image_aspect_ratio pad \ 28 | --group_by_modality_length True \ 29 | --image_size_aux $AUX_SIZE \ 30 | --bf16 True \ 31 | --output_dir ./work_dirs/$FINETUNE_NAME \ 32 | --num_train_epochs 1 \ 33 | --per_device_train_batch_size 4 \ 34 | --per_device_eval_batch_size 4 \ 35 | --gradient_accumulation_steps 1 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 1000 \ 39 | --save_total_limit 1 \ 40 | --learning_rate 1e-5 \ 41 | --lr_multi $LR_MULTI \ 42 | --weight_decay 0. \ 43 | --warmup_ratio 0.03 \ 44 | --lr_scheduler_type "cosine" \ 45 | --logging_steps 1 \ 46 | --tf32 True \ 47 | --model_max_length 4096 \ 48 | --gradient_checkpointing True \ 49 | --dataloader_num_workers 4 \ 50 | --lazy_preprocess True \ 51 | --report_to wandb 52 | -------------------------------------------------------------------------------- /scripts/yi/eval/math_vista.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-34B" 9 | OPENAIKEY="" 10 | OPENAIBASE="" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_math_vista \ 14 | --model-path work_dirs/$CKPT \ 15 | --question-file data/MGM-Eval/MathVista/testmini.json \ 16 | --image-folder data/MGM-Eval/MathVista \ 17 | --answers-file data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode chatml_direct & 22 | done 23 | 24 | wait 25 | 26 | output_file=./data/MGM-Eval/MathVista/answers/$CKPT/merge.jsonl 27 | score_file=./data/MGM-Eval/MathVista/answers/$CKPT/score.jsonl 28 | 29 | # Clear out the output file if it exists. 
30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MathVista/extract_answer.py \ 38 | --output_file $output_file \ 39 | --api_key $OPENAIKEY \ 40 | --api_base $OPENAIBASE 41 | 42 | python mgm/eval/MathVista/calculate_score.py \ 43 | --output_file $output_file \ 44 | --score_file $score_file \ 45 | --gt_file data/MGM-Eval/MathVista/testmini.json 46 | -------------------------------------------------------------------------------- /scripts/yi/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-34B" 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_mmbench \ 7 | --model-path ./work_dirs/$CKPT \ 8 | --question-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./data/MGM-Eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode chatml_direct 13 | 14 | mkdir -p ./data/MGM-Eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./data/MGM-Eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./data/MGM-Eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment $CKPT 21 | -------------------------------------------------------------------------------- /scripts/yi/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-34B" 4 | 5 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_loader \ 6 | --model-path work_dirs/$CKPT \ 7 | --question-file data/MGM-Eval/MME/llava_mme.jsonl \ 8 | --image-folder data/MGM-Eval/MME/MME_Benchmark_release_version \ 9 | --answers-file data/MGM-Eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode chatml_direct 12 | 13 | cd data/MGM-Eval/MME 14 | 15 | python convert_answer_to_mme.py --experiment $CKPT 16 | 17 | cd eval_tool 18 | 19 | python calculation.py --results_dir answers/$CKPT 20 | -------------------------------------------------------------------------------- /scripts/yi/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-34B" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "validation" \ 21 | --conv-mode chatml_direct & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python mgm/eval/MMMU/eval/eval.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/val.json -------------------------------------------------------------------------------- /scripts/yi/eval/mmmu_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-34B" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "test" \ 21 | --conv-mode chatml_direct & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python mgm/eval/MMMU/eval/convert_to_test.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/test.json -------------------------------------------------------------------------------- /scripts/yi/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-34B" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa \ 12 | --model-path work_dirs/$CKPT \ 13 | --question-file data/MGM-Eval/mm-vet/llava-mm-vet.jsonl \ 14 | --image-folder data/MGM-Eval/mm-vet/images \ 15 | --answers-file data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode chatml_direct & 20 | done 21 | 22 | wait 23 | 24 | output_file=data/MGM-Eval/mm-vet/answers/$CKPT/merge.jsonl 25 | # Clear out the output file if it exists. 26 | > "$output_file" 27 | 28 | # Loop through the indices and concatenate each file. 
29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | mkdir -p data/MGM-Eval/mm-vet/results/$CKPT 34 | 35 | python scripts/convert_mmvet_for_eval.py \ 36 | --src $output_file \ 37 | --dst data/MGM-Eval/mm-vet/results/$CKPT/$CKPT.json 38 | 39 | -------------------------------------------------------------------------------- /scripts/yi/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-34B" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa_loader \ 13 | --model-path ./work_dirs/$CKPT \ 14 | --question-file ./data/MGM-Eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 15 | --image-folder ./data/MGM-Eval/textvqa/train_images \ 16 | --answers-file ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --load_8bit True \ 21 | --conv-mode chatml_direct & 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/textvqa/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python -m mgm.eval.eval_textvqa \ 37 | --annotation-file ./data/MGM-Eval/textvqa/TextVQA_0.5.1_val.json \ 38 | --result-file $output_file 39 | -------------------------------------------------------------------------------- /scripts/yi/train/stage_1_2_full_yi34b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-34B-Pretrain 3 | FINETUNE_NAME=MGM-34B 4 | AUX_SIZE=768 5 | LR_MULTI="model.mm_projector:2,model.vlm_uni:2" 6 | 7 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 8 | 9 | deepspeed --hostfile hostfile_4 \ 10 | mgm/train/train_mem.py \ 11 | --deepspeed ./scripts/zero3.json \ 12 | --model_name_or_path model_zoo/LLM/Nous-Hermes-2-Yi-34B \ 13 | --version plain \ 14 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 15 | --image_folder ./data/MGM-Pretrain \ 16 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 17 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --image_size_aux $AUX_SIZE \ 24 | --bf16 True \ 25 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | 46 | 47 | deepspeed --hostfile hostfile_4 \ 48 | mgm/train/train_mem.py \ 49 | --deepspeed ./scripts/zero3.json \ 50 | --model_name_or_path model_zoo/LLM/Nous-Hermes-2-Yi-34B \ 51 | --version chatml_direct \ 52 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 53 | --image_folder ./data/MGM-Finetune \ 54 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 55 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 56 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 57 | --mm_projector_type mlp2x_gelu \ 58 | --mm_vision_select_layer -2 \ 59 | --mm_use_im_start_end False \ 60 | --mm_use_im_patch_token False \ 61 | --image_aspect_ratio pad \ 62 | --group_by_modality_length True \ 63 | --image_size_aux $AUX_SIZE \ 64 | --bf16 True \ 65 | --output_dir ./work_dirs/$FINETUNE_NAME \ 66 | --num_train_epochs 1 \ 67 | --per_device_train_batch_size 4 \ 68 | --per_device_eval_batch_size 4 \ 69 | --gradient_accumulation_steps 1 \ 70 | --evaluation_strategy "no" \ 71 | --save_strategy "steps" \ 72 | --save_steps 1000 \ 73 | --save_total_limit 1 \ 74 | --learning_rate 1e-5 \ 75 | --lr_multi $LR_MULTI \ 76 | --weight_decay 0. \ 77 | --warmup_ratio 0.03 \ 78 | --lr_scheduler_type "cosine" \ 79 | --logging_steps 1 \ 80 | --tf32 True \ 81 | --model_max_length 2048 \ 82 | --gradient_checkpointing True \ 83 | --dataloader_num_workers 4 \ 84 | --lazy_preprocess True \ 85 | --report_to wandb 86 | -------------------------------------------------------------------------------- /scripts/yi/train/stage_2_full_yi34b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-34B-Pretrain 3 | FINETUNE_NAME=MGM-34B-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | LR_MULTI="model.mm_projector:2,model.vlm_uni:2" 8 | 9 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 10 | 11 | 12 | deepspeed --hostfile hostfile_4 \ 13 | mgm/train/train_mem.py \ 14 | --deepspeed ./scripts/zero3.json \ 15 | --model_name_or_path model_zoo/LLM/Nous-Hermes-2-Yi-34B \ 16 | --version chatml_direct \ 17 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 18 | --image_folder ./data/MGM-Finetune \ 19 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 20 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 21 | --image_grid $IMAGE_GRID \ 22 | --image_global $IMAGE_GLOBAL \ 23 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 24 | --mm_projector_type mlp2x_gelu \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --image_aspect_ratio pad \ 29 | --group_by_modality_length True \ 30 | --image_size_aux $AUX_SIZE \ 31 | --bf16 True \ 32 | --output_dir ./work_dirs/$FINETUNE_NAME \ 33 | --num_train_epochs 1 \ 34 | --per_device_train_batch_size 4 \ 35 | --per_device_eval_batch_size 4 \ 36 | --gradient_accumulation_steps 1 \ 37 | --evaluation_strategy "no" \ 38 | --save_strategy "steps" \ 39 | --save_steps 1000 \ 40 | --save_total_limit 1 \ 41 | --learning_rate 1e-5 \ 42 | --lr_multi $LR_MULTI \ 43 | --weight_decay 0. 
\ 44 | --warmup_ratio 0.03 \ 45 | --lr_scheduler_type "cosine" \ 46 | --logging_steps 1 \ 47 | --tf32 True \ 48 | --model_max_length 4096 \ 49 | --gradient_checkpointing True \ 50 | --dataloader_num_workers 4 \ 51 | --lazy_preprocess True \ 52 | --report_to wandb 53 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "offload_optimizer": { 19 | "device": "cpu", 20 | "pin_memory": true 21 | }, 22 | "offload_param": { 23 | "device": "cpu", 24 | "pin_memory": true 25 | }, 26 | "overlap_comm": true, 27 | "contiguous_gradients": true, 28 | "sub_group_size": 1e9, 29 | "reduce_bucket_size": "auto" 30 | } 31 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } --------------------------------------------------------------------------------
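Note on the DeepSpeed configs above: the "auto" fields in zero2.json, zero2_offload.json and zero3.json (train_micro_batch_size_per_gpu, gradient_accumulation_steps, train_batch_size, bf16/fp16 enabled, reduce_bucket_size) are resolved at launch time by the HuggingFace Trainer / DeepSpeed integration from the corresponding command-line flags in the training scripts, so the effective global batch size follows from --per_device_train_batch_size, --gradient_accumulation_steps and the total GPU count. A minimal sketch of that arithmetic is below; the WORLD_SIZE value of 8 is an assumption for illustration only, since the actual GPU count depends on the hostfile / hostfile_4 contents, which are not shown in this dump.

#!/bin/bash
# Effective global batch size for the stage-2 HD fine-tune scripts above.
PER_DEVICE=4                   # --per_device_train_batch_size in those scripts
GRAD_ACCUM=1                   # --gradient_accumulation_steps in those scripts
WORLD_SIZE="${WORLD_SIZE:-8}"  # total GPUs across all machines (assumed value)
echo "global batch size = $(( PER_DEVICE * GRAD_ACCUM * WORLD_SIZE ))"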
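Every training script above repeats the comment that --hostfile should be deleted and --per_device_train_batch_size changed when training on a single machine. The sketch below shows what that adjustment could look like for the stage-1 pretraining command, assuming a single node with 8 GPUs; the --num_gpus launcher flag, the batch size, and the gradient-accumulation value are illustrative assumptions, not settings validated in this repository, and all other flags are copied verbatim from stage_1_2_full_v7b_336_hr_768.sh.

#!/bin/bash
# Hypothetical single-machine variant of the stage-1 pretraining command.
# Assumptions: one node with 8 GPUs; --hostfile dropped per the scripts' own
# comment; batch size / gradient accumulation are illustrative guesses.
PRETRAIN_NAME=MGM-7B-Pretrain
AUX_SIZE=768

deepspeed --num_gpus 8 \
    mgm/train/train_mem.py \
    --deepspeed ./scripts/zero2_offload.json \
    --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \
    --version plain \
    --data_path ./data/MGM-Pretrain/mgm_pretrain.json \
    --image_folder ./data/MGM-Pretrain \
    --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \
    --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \
    --mm_projector_type mlp2x_gelu \
    --tune_mm_mlp_adapter True \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --image_size_aux $AUX_SIZE \
    --bf16 True \
    --output_dir ./work_dirs/$PRETRAIN_NAME \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 24000 \
    --save_total_limit 1 \
    --learning_rate 1e-3 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb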