├── LICENSE ├── README.md ├── cog.yaml ├── hostfile ├── hostfile_4 ├── images ├── demo_gen.png ├── demo_und.png ├── pipeline.png └── teaser.png ├── mgm ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── MMMU │ │ ├── LICENSE │ │ ├── README.md │ │ ├── eval │ │ │ ├── README.md │ │ │ ├── answer_dict_val.json │ │ │ ├── configs │ │ │ │ └── llava1.5.yaml │ │ │ ├── convert_to_test.py │ │ │ ├── eval.py │ │ │ ├── example_outputs │ │ │ │ ├── llava1.5_13b │ │ │ │ │ ├── Accounting │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Agriculture │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Architecture_and_Engineering │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Art │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Art_Theory │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Basic_Medical_Science │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Biology │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Chemistry │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Clinical_Medicine │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Computer_Science │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Design │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Diagnostics_and_Laboratory_Medicine │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Economics │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Electronics │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Energy_and_Power │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Finance │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Geography │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── History │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Literature │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Manage │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Marketing │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Materials │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Math │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Mechanical_Engineering │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Music │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Pharmacy │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Physics │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Psychology │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Public_Health │ │ │ │ │ │ └── output.json │ │ │ │ │ ├── Sociology │ │ │ │ │ │ └── output.json │ │ │ │ │ └── total_val_output.json │ │ │ │ ├── llava1.5_13b_val.json │ │ │ │ └── qwen_vl │ │ │ │ │ ├── Accounting │ │ │ │ │ └── output.json │ │ │ │ │ ├── Agriculture │ │ │ │ │ └── output.json │ │ │ │ │ ├── Architecture_and_Engineering │ │ │ │ │ └── output.json │ │ │ │ │ ├── Art │ │ │ │ │ └── output.json │ │ │ │ │ ├── Art_Theory │ │ │ │ │ └── output.json │ │ │ │ │ ├── Basic_Medical_Science │ │ │ │ │ └── output.json │ │ │ │ │ ├── Biology │ │ │ │ │ └── output.json │ │ │ │ │ ├── Chemistry │ │ │ │ │ └── output.json │ │ │ │ │ ├── Clinical_Medicine │ │ │ │ │ └── output.json │ │ │ │ │ ├── Computer_Science │ │ │ │ │ └── output.json │ │ │ │ │ ├── Design │ │ │ │ │ └── output.json │ │ │ │ │ ├── Diagnostics_and_Laboratory_Medicine │ │ │ │ │ └── output.json │ │ │ │ │ ├── Economics │ │ │ │ │ └── output.json │ │ │ │ │ ├── Electronics │ │ │ │ │ └── output.json │ │ │ │ │ ├── Energy_and_Power │ │ │ │ │ └── output.json │ │ │ │ │ ├── Finance │ │ │ │ │ └── output.json │ │ │ │ │ ├── Geography │ │ │ │ │ └── output.json │ │ │ │ │ ├── History │ │ │ │ │ └── output.json │ │ │ │ │ ├── Literature │ │ │ │ │ └── output.json │ │ │ │ │ ├── Manage │ │ │ │ │ └── output.json │ │ │ │ │ ├── Marketing │ │ │ │ │ └── output.json │ │ │ │ │ ├── Materials │ │ │ │ │ └── output.json │ │ │ │ │ ├── Math │ │ │ │ │ └── output.json │ │ │ │ │ ├── Mechanical_Engineering │ │ │ │ │ └── output.json │ │ │ │ │ ├── Music │ │ │ │ │ └── output.json │ │ │ │ │ ├── Pharmacy │ │ │ │ │ └── 
output.json │ │ │ │ │ ├── Physics │ │ │ │ │ └── output.json │ │ │ │ │ ├── Psychology │ │ │ │ │ └── output.json │ │ │ │ │ ├── Public_Health │ │ │ │ │ └── output.json │ │ │ │ │ ├── Sociology │ │ │ │ │ └── output.json │ │ │ │ │ └── total_val_output.json │ │ │ ├── main_eval_only.py │ │ │ ├── main_parse_and_eval.py │ │ │ ├── print_results.py │ │ │ ├── run_llava.py │ │ │ └── utils │ │ │ │ ├── data_utils.py │ │ │ │ ├── eval_utils.py │ │ │ │ └── model_utils.py │ │ └── image.png │ ├── MathVista │ │ ├── calculate_score.py │ │ ├── extract_answer.py │ │ ├── prompts │ │ │ └── ext_ans.py │ │ └── utilities.py │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── model_math_vista.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_qbench.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ ├── summarize_gpt_review.py │ └── webpage │ │ ├── figures │ │ ├── alpaca.png │ │ ├── bard.jpg │ │ ├── chatgpt.svg │ │ ├── llama.jpg │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ └── vicuna.jpeg │ │ ├── index.html │ │ ├── script.js │ │ └── styles.css ├── mm_utils.py ├── model │ ├── __init__.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── mgm_gemma.py │ │ ├── mgm_llama.py │ │ ├── mgm_mistral.py │ │ └── mgm_mixtral.py │ ├── llava_arch.py │ ├── mgm_arch.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ ├── eva_encoder.py │ │ └── openclip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── processor │ │ └── video_processor.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ ├── monday.jpg │ │ ├── waterview.jpg │ │ └── woolen.png │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llama_xformers_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── train.py │ ├── train_mem.py │ └── train_xformers.py └── utils.py ├── predict.py ├── pyproject.toml └── scripts ├── convert_gqa_for_eval.py ├── convert_mmbench_for_submission.py ├── convert_mmvet_for_eval.py ├── convert_seed_for_submission.py ├── extract_mm_projector.py ├── gemma ├── eval │ ├── math_vista.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu.sh │ ├── mmmu_test.sh │ ├── mmvet.sh │ └── textvqa.sh └── train │ ├── stage_1_2_full_gemma_v2b_336_hr_768.sh │ └── stage_2_full_gemma_v2b_672_hr_1536.sh ├── llama ├── eval │ ├── math_vista.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu.sh │ ├── mmmu_test.sh │ ├── mmvet.sh │ └── textvqa.sh └── train │ ├── stage_1_2_full_v13b_336_hr_768.sh │ ├── stage_1_2_full_v7b_336_hr_768.sh │ ├── stage_1_2_full_v7b_336_hr_768_nodp.sh │ ├── stage_2_full_v13b_672_hr_1536.sh │ └── stage_2_full_v7b_672_hr_1536.sh ├── llama3 ├── eval │ ├── math_vista.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu.sh │ ├── mmmu_test.sh │ ├── mmvet.sh │ └── textvqa.sh └── train │ ├── stage_1_2_full_v8b_336_hr_768.sh │ └── stage_2_full_v8b_672_hr_1536.sh ├── merge_lora_weights.py ├── mixtral ├── eval │ ├── math_vista.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu.sh │ ├── mmmu_test.sh │ ├── mmvet.sh │ └── textvqa.sh └── train │ ├── stage_1_2_full_mixtral_8x7b_336_hr_768.sh │ └── stage_2_full_mixtral_8x7b_672_hr_1536.sh ├── yi ├── eval │ 
├── math_vista.sh │ ├── mmbench.sh │ ├── mme.sh │ ├── mmmu.sh │ ├── mmmu_test.sh │ ├── mmvet.sh │ └── textvqa.sh └── train │ ├── stage_1_2_full_yi34b_336_hr_768.sh │ └── stage_2_full_yi34b_672_hr_1536.sh ├── zero2.json ├── zero2_offload.json └── zero3.json /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - "bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /hostfile: -------------------------------------------------------------------------------- 1 | your_ip_0 slots=8 2 | your_ip_1 slots=8 3 | -------------------------------------------------------------------------------- /hostfile_4: -------------------------------------------------------------------------------- 1 | your_ip_0 slots=8 2 | your_ip_1 slots=8 3 | your_ip_2 slots=8 4 | your_ip_3 slots=8 -------------------------------------------------------------------------------- /images/demo_gen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/images/demo_gen.png -------------------------------------------------------------------------------- /images/demo_und.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/images/demo_und.png -------------------------------------------------------------------------------- /images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/images/pipeline.png -------------------------------------------------------------------------------- /images/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/images/teaser.png -------------------------------------------------------------------------------- /mgm/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import MGMLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /mgm/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 
15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | PREDICT_TOKEN_INDEX = -300 10 | DEFAULT_IMAGE_TOKEN = "" 11 | DEFAULT_IMAGE_PATCH_TOKEN = "" 12 | DEFAULT_IM_START_TOKEN = "" 13 | DEFAULT_IM_END_TOKEN = "" 14 | IMAGE_PLACEHOLDER = "" 15 | DEFAULT_PREDICT_TOKEN = "" 16 | 17 | DESCRIPT_PROMPT = [ 18 | "Describe this image thoroughly.", 19 | "Provide a detailed description in this picture.", 20 | "Detail every aspect of what's in this picture.", 21 | "Explain this image with precision and detail.", 22 | "Give a comprehensive description of this visual.", 23 | "Elaborate on the specifics within this image.", 24 | "Offer a detailed account of this picture's contents.", 25 | "Describe in detail what this image portrays.", 26 | "Break down this image into detailed descriptions.", 27 | "Provide a thorough description of the elements in this image."] -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation Guidelines 2 | We provide detailed instructions for evaluation. 3 | To execute our evaluation script, please ensure that the structure of your model outputs is the same as ours. 4 | 5 | We provide two options: 6 | 1. Evaluation only: you can parse the response on your own and simply provide one file with all the final predictions. 7 | 2. Parse and evaluation: you can leave all the responses to us with the output formats shown below. 8 | 9 | ## Evaluation Only 10 | If you want to use your own parsing logic and *only provide the final answer*, you can use `main_eval_only.py`. 11 | 12 | You can provide all the outputs in *one file* in the following format: 13 | 14 | ``` 15 | { 16 | "validation_Accounting_1": "D", # strictly "A", "B", "C", "D" for multi-choice question 17 | "validation_Architecture_and_Engineering_14": "0.0", # any string response for open question. 18 | ... 19 | } 20 | ``` 21 | Then run eval_only with: 22 | ``` 23 | python main_eval_only.py --output_path ./example_outputs/llava1.5_13b/total_val_output.json 24 | ``` 25 | 26 | Please refer to [example output](https://github.com/MMMU-Benchmark/MMMU/blob/main/eval/example_outputs/llava1.5_13b/total_val_output.json) for a detailed prediction file form. 27 | 28 | 29 | ## Parse and Evaluation 30 | You can also provide response and run the `main_parse_and_eval.py` to use our answer parsing processing and evaluation pipeline as follows: 31 | 32 | ### Output folder structure 33 | 34 | ``` 35 | └── model_name 36 | ├── category_name (e.g., Accounting) 37 | │ ├── output.json 38 | └── category_name (e.g., Electronics) 39 | ├── output.json 40 | ... 41 | ``` 42 | 43 | ### Output file 44 | Each `output.json`` has a list of dict containing instances for evaluation (). 
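As a purely illustrative aside (this snippet is hypothetical and not part of the evaluation code), one way to assemble and write such a per-category file from raw model predictions is sketched here; the field names follow the format example given right after this sketch:

```
import json
import os

# Hypothetical predictions gathered from a model run
# (field names as in the format example shown below).
predictions = [
    {
        "id": "validation_Electronics_28",
        "question_type": "multiple-choice",
        "answer": "A",
        "all_choices": ["A", "B", "C", "D"],
        "index2ans": {"A": "...", "B": "...", "C": "...", "D": "..."},
        "response": "B",
    },
]

# "my_model" is a placeholder for your model name.
out_dir = os.path.join("example_outputs", "my_model", "Electronics")
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, "output.json"), "w") as f:
    json.dump(predictions, f, indent=2)
```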
45 | ``` 46 | [ 47 | { 48 | "id": "validation_Electronics_28", 49 | "question_type": "multiple-choice", 50 | "answer": "A", # given answer 51 | "all_choices": [ # create using `get_multi_choice_info` in 52 | "A", 53 | "B", 54 | "C", 55 | "D" 56 | ], 57 | "index2ans": { # create using `get_multi_choice_info` in 58 | "A": "75 + 13.3 cos(250t - 57.7°)V", 59 | "B": "75 + 23.3 cos(250t - 57.7°)V", 60 | "C": "45 + 3.3 cos(250t - 57.7°)V", 61 | "D": "95 + 13.3 cos(250t - 57.7°)V" 62 | }, 63 | "response": "B" # model response 64 | }, 65 | { 66 | "id": "validation_Electronics_29", 67 | "question_type": "short-answer", 68 | "answer": "30", # given answer 69 | "response": "36 watts" # model response 70 | }, 71 | ... 72 | ] 73 | ``` 74 | 75 | ### Evaluation 76 | ``` 77 | python main_parse_and_eval.py --path ./example_outputs/llava1.5_13b --subject ALL # all subject 78 | 79 | # OR you can sepecify one subject for the evaluation 80 | 81 | python main_parse_and_eval.py --path ./example_outputs/llava1.5_13b --subject elec # short name for Electronics. use --help for all short names 82 | 83 | ``` 84 | 85 | `main_parse_and_eval.py` will generate `parsed_output.json` and `result.json` in the subfolder under the same category with output.json, respectively. 86 | 87 | ``` 88 | ├── Accounting 89 | │ ├── output.json 90 | │ ├── parsed_output.json 91 | │ └── result.json 92 | └── Electronics 93 | ├── output.json 94 | ├── parsed_output.json 95 | └── result.json 96 | ... 97 | ``` 98 | 99 | ### Print Results 100 | You can print results locally if you want. (use `pip install tabulate` if you haven't) 101 | ``` 102 | python print_results.py --path ./example_outputs/llava1.5_13b 103 | # Results may be slightly different due to the ramdon selection for fail response 104 | ``` 105 | 106 | 107 | 108 | ##### Run Llava 109 | In case if you want to reproduce the results of some of the models, please go check run_llava.py as an example. 110 | 111 | By seeting up the env following the [llava official repo](https://github.com/haotian-liu/LLaVA) and installing `datasets` packages by huggingface, you can run llava viathe following command: 112 | 113 | ``` 114 | CUDA_VISIBLE_DEVICES=0 nohup python run_llava.py \ 115 | --output_path example_outputs/llava1.5_13b_val.json \ 116 | --model_path liuhaotian/llava-v1.5-13b \ 117 | --config_path configs/llava1.5.yaml 118 | ``` 119 | 120 | Then you can evaluate the results via the very first pipeline. 121 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/configs/llava1.5.yaml: -------------------------------------------------------------------------------- 1 | task_instructions: 2 | - "" 3 | multi_choice_example_format: 4 | - "{} 5 | 6 | {} 7 | 8 | Answer with the option's letter from the given choices directly." 9 | 10 | short_ans_example_format: 11 | - "{} 12 | 13 | Answer the question using a single word or phrase." 
14 | temperature: 15 | - 0 -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/convert_to_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from argparse import ArgumentParser 4 | 5 | from utils.eval_utils import evaluate 6 | from utils.data_utils import save_json 7 | 8 | 9 | def main(): 10 | parser = ArgumentParser() 11 | parser.add_argument('--result_file', type=str, default='llava1.5_13b_val.txt', 12 | help='name of saved json') 13 | parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json', 14 | help='name of saved json') 15 | 16 | args = parser.parse_args() 17 | out_samples = [json.loads(line) for line in open(args.result_file)] 18 | out_json = {} 19 | for _sample in out_samples: 20 | _result = _sample['parsed_pred'] 21 | if isinstance(_result, list): 22 | _result = str(_result[0]) 23 | out_json[_sample['id']] = _result 24 | 25 | save_json(args.output_path, out_json) 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from argparse import ArgumentParser 4 | 5 | from utils.eval_utils import evaluate 6 | from utils.data_utils import save_json 7 | 8 | 9 | def main(): 10 | parser = ArgumentParser() 11 | parser.add_argument('--result_file', type=str, default='llava1.5_13b_val.txt', 12 | help='name of saved json') 13 | parser.add_argument('--output_path', type=str, default='llava1.5_13b_val.json', 14 | help='name of saved json') 15 | 16 | args = parser.parse_args() 17 | out_samples = [json.loads(line) for line in open(args.result_file)] 18 | 19 | judge_dict, metric_dict = evaluate(out_samples) 20 | metric_dict.update({"num_example": len(out_samples)}) 21 | judge_dict['metric_dict'] = metric_dict 22 | save_dir = '/'.join(args.output_path.split('/')[:-1]) 23 | if not os.path.exists(save_dir): 24 | os.makedirs(save_dir) 25 | save_json(args.output_path, judge_dict) 26 | 27 | print(metric_dict) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/main_eval_only.py: -------------------------------------------------------------------------------- 1 | """Parse and Evalate""" 2 | import os 3 | import json 4 | 5 | import pdb 6 | from argparse import ArgumentParser 7 | 8 | from utils.data_utils import save_json, CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT 9 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response, calculate_ins_level_acc 10 | 11 | 12 | if __name__ == '__main__': 13 | 14 | parser = ArgumentParser() 15 | parser.add_argument('--output_path', type=str, default="./example_outputs/qwen_vl/total_val_output.json", help="The path to model output file.") 16 | parser.add_argument('--answer_path', type=str, default="./answer_dict_val.json", help="Answer file path.") 17 | args = parser.parse_args() 18 | 19 | output_dict = json.load(open(args.output_path)) 20 | answer_dict = json.load(open(args.answer_path)) 21 | 22 | # group by category 23 | output_dict_w_cat = {} 24 | for data_id, parsed_pred in output_dict.items(): 25 | category = "_".join(data_id.split("_")[1:-1]) 26 | if category not in output_dict_w_cat: 27 | output_dict_w_cat.update({category: {}}) 28 | 
output_dict_w_cat[category].update({data_id: parsed_pred}) 29 | 30 | # group by category 31 | answer_dict_w_cat = {} 32 | for data_id, parsed_pred in answer_dict.items(): 33 | category = "_".join(data_id.split("_")[1:-1]) 34 | if category not in answer_dict_w_cat: 35 | answer_dict_w_cat.update({category: {}}) 36 | answer_dict_w_cat[category].update({data_id: parsed_pred}) 37 | 38 | evaluation_result = {} 39 | 40 | for category in CAT_SHORT2LONG.values(): 41 | print("Evaluating: {}".format(category)) 42 | # get cat_outputs and cat_answers 43 | try: 44 | cat_outputs = output_dict_w_cat[category] 45 | cat_answers = answer_dict_w_cat[category] 46 | except KeyError: 47 | print("Skipping {} for not found".format(category)) 48 | continue 49 | 50 | exampels_to_eval = [] 51 | for data_id, parsed_pred in cat_outputs.items(): 52 | question_type = cat_answers[data_id]['question_type'] 53 | if question_type != 'multiple-choice': 54 | parsed_pred = parse_open_response(parsed_pred) # mainly for type consistency (make it number, etc.) 55 | else: 56 | parsed_pred = parsed_pred 57 | 58 | exampels_to_eval.append({ 59 | "id": data_id, 60 | "question_type": question_type, 61 | "answer": cat_answers[data_id]['ground_truth'], 62 | "parsed_pred": parsed_pred 63 | }) 64 | 65 | judge_dict, metric_dict = evaluate(exampels_to_eval) 66 | metric_dict.update({"num_example": len(exampels_to_eval)}) 67 | 68 | evaluation_result[category] = metric_dict 69 | 70 | printable_results = {} 71 | # pdb.set_trace() 72 | # add domain Subject 73 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): 74 | in_domain_cat_results = {} 75 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT 76 | if cat_name in evaluation_result.keys(): 77 | in_domain_cat_results[cat_name] = evaluation_result[cat_name] 78 | else: 79 | pass 80 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) 81 | in_domain_data_num = sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()]) 82 | printable_results['Overall-' + domain] = {"num": int(in_domain_data_num), 83 | "acc": round(in_domain_ins_acc, 3) 84 | } 85 | # add sub category 86 | for cat_name, cat_results in in_domain_cat_results.items(): 87 | printable_results[cat_name] = {"num": int(cat_results['num_example']), 88 | "acc": round(cat_results['acc'], 3) 89 | } 90 | 91 | # table.append(["-----------------------------", "-----", "----"]) 92 | all_ins_acc = calculate_ins_level_acc(evaluation_result) 93 | printable_results['Overall'] = {"num": sum([cat_results['num_example'] for cat_results in evaluation_result.values()]), 94 | "acc": round(all_ins_acc, 3) 95 | } 96 | 97 | print(printable_results) 98 | 99 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/main_parse_and_eval.py: -------------------------------------------------------------------------------- 1 | """Parse and Evalate""" 2 | import os 3 | import json 4 | from argparse import ArgumentParser 5 | 6 | from utils.data_utils import save_json, CAT_SHORT2LONG 7 | from utils.eval_utils import evaluate, parse_multi_choice_response, parse_open_response 8 | 9 | 10 | if __name__ == '__main__': 11 | 12 | parser = ArgumentParser() 13 | parser.add_argument('--path', type=str, default="./example_outputs/llava1.5_13b", help="The path to model output directory.") 14 | parser.add_argument('--subject', nargs='+', 15 | help=f'The name of the mmmu sub-category. 
Availble: {CAT_SHORT2LONG.keys()} or ALL') 16 | 17 | args = parser.parse_args() 18 | if args.subject[0] == 'ALL': 19 | args.subject = CAT_SHORT2LONG.keys() 20 | 21 | ex_output_path = os.path.join(args.path) 22 | 23 | all_results = {} 24 | for cat_short in args.subject: 25 | category = CAT_SHORT2LONG[cat_short] 26 | print("Evaluating: {}".format(category)) 27 | if category not in os.listdir(ex_output_path): 28 | print("Skipping {} for not found".format(category)) 29 | else: 30 | cat_folder_path = os.path.join(ex_output_path, category) 31 | cat_outputs = json.load(open(os.path.join(cat_folder_path, 'output.json'))) 32 | # Evaluation 33 | eval_samples = [] 34 | for cat_output in cat_outputs: 35 | response = cat_output['response'] 36 | if cat_output['question_type'] == 'multiple-choice': 37 | all_choices = cat_output['all_choices'] 38 | index2ans = cat_output['index2ans'] 39 | parsed_pred = parse_multi_choice_response(response, all_choices, index2ans) 40 | eval_samples.append( 41 | { 42 | 'id': cat_output['id'], 43 | 'question_type': cat_output['question_type'], 44 | 'answer': cat_output['answer'], # the content in option, not answer index. 45 | 'response': response, 46 | 'parsed_pred': parsed_pred, 47 | 'index2ans': index2ans, 48 | } 49 | ) 50 | else: # open 51 | parsed_pred = parse_open_response(response) 52 | eval_samples.append( 53 | { 54 | 'id': cat_output['id'], 55 | 'question_type': cat_output['question_type'], 56 | 'answer': cat_output['answer'], 57 | 'response': response, 58 | 'parsed_pred': parsed_pred, 59 | } 60 | ) 61 | 62 | print("Num of valid samples: {}, Expected Num: {}".format(len(eval_samples), len(cat_outputs))) 63 | 64 | judge_dict, metric_dict = evaluate(eval_samples) 65 | metric_dict.update({"num_example": len(eval_samples)}) 66 | for eval_sample in eval_samples: 67 | eval_sample.update({"judge": judge_dict[eval_sample['id']]}) 68 | 69 | save_json(os.path.join(cat_folder_path, 'parsed_output.json'), eval_samples) 70 | save_json(os.path.join(cat_folder_path, 'result.json'), metric_dict) 71 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/print_results.py: -------------------------------------------------------------------------------- 1 | # Beautiful table to print results of all categories 2 | 3 | import os 4 | from typing import Dict 5 | import json 6 | import numpy as np 7 | from tabulate import tabulate 8 | 9 | from argparse import ArgumentParser 10 | 11 | from utils.data_utils import CAT_SHORT2LONG, DOMAIN_CAT2SUB_CAT 12 | 13 | from utils.eval_utils import calculate_ins_level_acc 14 | 15 | def main(): 16 | parser = ArgumentParser() 17 | parser.add_argument('--path', type=str, default="./example_outputs/blip2_flant5xxl", help="The path to output directory.") 18 | args = parser.parse_args() 19 | 20 | # load all results 21 | all_results = {} 22 | for cat_folder_name in os.listdir(args.path): 23 | if cat_folder_name in CAT_SHORT2LONG.values(): 24 | cat_folder_path = os.path.join(args.path, cat_folder_name) 25 | result_path = os.path.join(cat_folder_path, 'result.json') 26 | if os.path.exists(result_path): 27 | cat_results = json.load(open(result_path)) 28 | all_results[cat_folder_name] = cat_results 29 | 30 | # print results 31 | headers = ['Subject', 'Data Num', 'Acc'] 32 | table = [] 33 | 34 | # add domain Subject 35 | for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items(): 36 | in_domain_cat_results = {} 37 | for cat_name in in_domain_cats: # use the order in DOMAIN_CAT2SUB_CAT 38 | if cat_name in 
all_results.keys(): 39 | in_domain_cat_results[cat_name] = all_results[cat_name] 40 | else: 41 | pass 42 | in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results) 43 | in_domain_data_num = np.sum([cat_results['num_example'] for cat_results in in_domain_cat_results.values()]) 44 | table.append(['Overall-' + domain, int(in_domain_data_num), round(in_domain_ins_acc, 3)]) 45 | # add sub category 46 | for cat_name, cat_results in in_domain_cat_results.items(): 47 | table.append([cat_name, int(cat_results['num_example']), round(cat_results['acc'], 3)]) 48 | # table.append(["-----------------------------", "-----", "----"]) 49 | 50 | # table.append(["-----------------------------", "-----", "----"]) 51 | all_ins_acc = calculate_ins_level_acc(all_results) 52 | table.append(['Overall', np.sum([cat_results['num_example'] for cat_results in all_results.values()]), round(all_ins_acc, 3)]) 53 | 54 | print(tabulate(table, headers=headers, tablefmt='orgtbl')) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/eval/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | import torch 3 | 4 | def call_llava_engine_df(args, sample, model, tokenizer=None, processor=None): 5 | from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 6 | from mgm.conversation import conv_templates, SeparatorStyle 7 | 8 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 9 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 10 | 11 | def insert_separator(X, sep): 12 | return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] 13 | 14 | input_ids = [] 15 | offset = 0 16 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 17 | offset = 1 18 | input_ids.append(prompt_chunks[0][0]) 19 | 20 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 21 | input_ids.extend(x[offset:]) 22 | 23 | if return_tensors is not None: 24 | if return_tensors == 'pt': 25 | return torch.tensor(input_ids, dtype=torch.long) 26 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 27 | return input_ids 28 | 29 | def deal_with_prompt(input_text, mm_use_im_start_end, ocr_tokens): 30 | if ocr_tokens is not None: 31 | qs = input_text + '\n' + ocr_tokens 32 | else: 33 | qs = input_text 34 | if mm_use_im_start_end: 35 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 36 | else: 37 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 38 | return qs 39 | 40 | prompt = sample['final_input_prompt'] 41 | ocr_tokens = sample.get('ocr', None) 42 | prompt = deal_with_prompt(prompt, model.config.mm_use_im_start_end, ocr_tokens) 43 | conv = conv_templates[args.conv_mode].copy() 44 | conv.append_message(conv.roles[0], prompt) 45 | conv.append_message(conv.roles[1], None) 46 | prompt = conv.get_prompt() 47 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 48 | image = sample['image'] 49 | image_aux = sample['image_aux'] 50 | if image_aux is not None: 51 | image_aux = image_aux.unsqueeze(0).half().cuda() 52 | 53 | terminators = tokenizer.eos_token_id 54 | if "llama_3" in args.conv_mode: 55 | terminators = [terminators, 
tokenizer.convert_tokens_to_ids("<|eot_id|>")] 56 | 57 | if image is not None: 58 | output_ids = model.generate( 59 | input_ids, 60 | images=image.unsqueeze(0).half().cuda(), 61 | images_aux=image_aux, 62 | do_sample=True, 63 | temperature=1, 64 | num_beams=5, 65 | top_p=None, 66 | max_new_tokens=128, 67 | bos_token_id=tokenizer.bos_token_id, # Begin of sequence token 68 | eos_token_id=terminators, # End of sequence token 69 | pad_token_id=tokenizer.pad_token_id, # Pad token 70 | use_cache=True) 71 | 72 | response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip('\n') 73 | else: # multiple images actually 74 | if sample['question_type'] == 'multiple-choice': 75 | all_choices = sample['all_choices'] 76 | response = random.choice(all_choices) 77 | else: 78 | response = 'INVALID GENERATION FOR MULTIPLE IMAGE INPUTS' 79 | 80 | return response 81 | -------------------------------------------------------------------------------- /mgm/eval/MMMU/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/eval/MMMU/image.png -------------------------------------------------------------------------------- /mgm/eval/MathVista/extract_answer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import time 4 | import argparse 5 | 6 | from tqdm import tqdm 7 | 8 | import sys 9 | sys.path.append('../') 10 | from utilities import * 11 | 12 | # OpenAI 13 | import openai 14 | 15 | # load demo prompt 16 | from prompts.ext_ans import demo_prompt 17 | 18 | 19 | def verify_extraction(extraction): 20 | extraction = extraction.strip() 21 | if extraction == "" or extraction == None: 22 | return False 23 | return True 24 | 25 | 26 | def create_test_prompt(demo_prompt, query, response): 27 | demo_prompt = demo_prompt.strip() 28 | test_prompt = f"{query}\n\n{response}" 29 | full_prompt = f"{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: " 30 | return full_prompt 31 | 32 | 33 | def extract_answer(response, problem, quick_extract=False): 34 | question_type = problem['question_type'] 35 | answer_type = problem['answer_type'] 36 | choices = problem['choices'] 37 | query = problem['query'] 38 | pid = problem['pid'] 39 | 40 | if response == "": 41 | return "" 42 | 43 | if question_type == 'multi_choice' and response in choices: 44 | return response 45 | 46 | if answer_type == "integer": 47 | try: 48 | extraction = int(response) 49 | return str(extraction) 50 | except: 51 | pass 52 | 53 | if answer_type == "float": 54 | try: 55 | extraction = str(float(response)) 56 | return extraction 57 | except: 58 | pass 59 | 60 | # quick extraction 61 | if quick_extract: 62 | print("Quickly extracting answer...") 63 | # The answer is "text". 
-> "text" 64 | try: 65 | result = re.search(r'The answer is "(.*)"\.', response) 66 | if result: 67 | extraction = result.group(1) 68 | return extraction 69 | except: 70 | pass 71 | 72 | # general extraction 73 | try: 74 | full_prompt = create_test_prompt(demo_prompt, query, response) 75 | extraction = get_chat_response(full_prompt, openai.api_key, openai.api_base, model=args.llm_engine) 76 | return extraction 77 | except Exception as e: 78 | print(e) 79 | print(f"Error in extracting answer for {pid}") 80 | 81 | return "" 82 | 83 | 84 | if __name__ == '__main__': 85 | parser = argparse.ArgumentParser() 86 | # input 87 | parser.add_argument('--output_file', type=str, default='answer.json') 88 | parser.add_argument('--response_label', type=str, default='response', help='response label for the input file') 89 | # model 90 | parser.add_argument('--llm_engine', type=str, default='gpt-4-0613', help='llm engine', 91 | choices = ['gpt-3.5-turbo', 'gpt-3.5', 'gpt-4', 'gpt-4-0314', 'gpt-4-0613']) 92 | parser.add_argument('--number', type=int, default=-1, help='number of problems to run') 93 | parser.add_argument('--quick_extract', action='store_true', help='use rules to extract answer for some problems') 94 | parser.add_argument('--rerun', action='store_true', help='rerun the answer extraction') 95 | # openai 96 | parser.add_argument("--api_key", required=True, type=str, help="OpenAI API key") 97 | parser.add_argument("--api_base", default=None, type=str, help="OpenAI API base") 98 | # output 99 | parser.add_argument('--save_every', type=int, default=10, help='save every n problems') 100 | parser.add_argument('--output_label', type=str, default='', help='label for the output file') 101 | args = parser.parse_args() 102 | 103 | # args 104 | label = args.response_label 105 | result_file = args.output_file 106 | if args.output_label != '': 107 | output_file = result_file.replace('.json', f'_{args.output_label}.json') 108 | else: 109 | output_file = result_file 110 | 111 | # read results 112 | print(f"Reading {result_file}...") 113 | try: 114 | results = read_json(output_file) 115 | except: 116 | samples = [json.loads(line) for line in open(result_file)] 117 | results = {} 118 | for sample in samples: 119 | results[sample['pid']] = sample 120 | 121 | # full pids 122 | full_pids = list(results.keys()) 123 | if args.number > 0: 124 | full_pids = full_pids[:min(args.number, len(full_pids))] 125 | print("Number of testing problems:", len(full_pids)) 126 | 127 | # test pids 128 | if args.rerun: 129 | test_pids = full_pids 130 | else: 131 | test_pids = [] 132 | for pid in full_pids: 133 | # print(pid) 134 | if 'extraction' not in results[pid] or not verify_extraction(results[pid]['extraction']): 135 | test_pids.append(pid) 136 | 137 | test_num = len(test_pids) 138 | print("Number of problems to run:", test_num) 139 | # print(test_pids) 140 | 141 | # openai api 142 | openai.api_key = args.api_key # Your API key here 143 | if args.api_base: 144 | openai.api_base = args.api_base # Your API base here 145 | 146 | # tqdm, enumerate results 147 | for i, pid in enumerate(tqdm(test_pids)): 148 | problem = results[pid] 149 | 150 | assert label in problem 151 | response = problem[label] 152 | 153 | 154 | extraction = extract_answer(response, problem, args.quick_extract) 155 | results[pid]['extraction'] = extraction 156 | 157 | if i % args.save_every == 0 or i == test_num - 1: 158 | print(f"Saving results to {output_file}...") 159 | save_json(results, output_file) 160 | print(f"Results saved.") 161 | 
-------------------------------------------------------------------------------- /mgm/eval/MathVista/prompts/ext_ans.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # pids = 852, 104, 824, 506, 540 4 | 5 | demo_prompt = """ 6 | Please read the following example. Then extract the answer from the model response and type it at the end of the prompt. 7 | 8 | Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end. 9 | Question: Which number is missing? 10 | 11 | Model response: The number missing in the sequence is 14. 12 | 13 | Extracted answer: 14 14 | 15 | Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end. 16 | Question: What is the fraction of females facing the camera? 17 | 18 | Model response: The fraction of females facing the camera is 0.6, which means that six out of ten females in the group are facing the camera. 19 | 20 | Extracted answer: 0.6 21 | 22 | Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end. 23 | Question: How much money does Luca need to buy a sour apple candy and a butterscotch candy? (Unit: $) 24 | 25 | Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy. 26 | 27 | Extracted answer: 1.45 28 | 29 | Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end. 30 | Question: Between which two years does the line graph saw its maximum peak? 31 | 32 | Model response: The line graph saw its maximum peak between 2007 and 2008. 33 | 34 | Extracted answer: [2007, 2008] 35 | 36 | Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end. 37 | Question: What fraction of the shape is blue?\nChoices:\n(A) 3/11\n(B) 8/11\n(C) 6/11\n(D) 3/5 38 | 39 | Model response: The correct answer is (B) 8/11. 40 | 41 | Extracted answer: B 42 | """ -------------------------------------------------------------------------------- /mgm/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import tqdm 7 | import ray 8 | import time 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | @ray.remote(num_cpus=4) 13 | def get_eval(content: str, max_tokens: int): 14 | while True: 15 | try: 16 | response = openai.ChatCompletion.create( 17 | model='gpt-4', 18 | messages=[{ 19 | 'role': 'system', 20 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
21 | }, { 22 | 'role': 'user', 23 | 'content': content, 24 | }], 25 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 26 | max_tokens=max_tokens, 27 | ) 28 | break 29 | except openai.error.RateLimitError: 30 | pass 31 | except Exception as e: 32 | print(e) 33 | time.sleep(NUM_SECONDS_TO_SLEEP) 34 | 35 | print('success!') 36 | return response['choices'][0]['message']['content'] 37 | 38 | 39 | def parse_score(review): 40 | try: 41 | score_pair = review.split('\n')[0] 42 | score_pair = score_pair.replace(',', ' ') 43 | sp = score_pair.split(' ') 44 | if len(sp) == 2: 45 | return [float(sp[0]), float(sp[1])] 46 | else: 47 | print('error', review) 48 | return [-1, -1] 49 | except Exception as e: 50 | print(e) 51 | print('error', review) 52 | return [-1, -1] 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 57 | parser.add_argument('-q', '--question') 58 | # parser.add_argument('-a', '--answer') 59 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 60 | parser.add_argument('-r', '--rule') 61 | parser.add_argument('-o', '--output') 62 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 63 | args = parser.parse_args() 64 | 65 | ray.init() 66 | 67 | f_q = open(os.path.expanduser(args.question)) 68 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 69 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 70 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 71 | 72 | review_file = open(f'{args.output}', 'w') 73 | 74 | js_list = [] 75 | handles = [] 76 | idx = 0 77 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 78 | # if idx == 1: 79 | # break 80 | 81 | ques = json.loads(ques_js) 82 | ans1 = json.loads(ans1_js) 83 | ans2 = json.loads(ans2_js) 84 | 85 | category = json.loads(ques_js)['category'] 86 | if category in rule_dict: 87 | rule = rule_dict[category] 88 | else: 89 | rule = rule_dict['default'] 90 | prompt = rule['prompt'] 91 | role = rule['role'] 92 | content = (f'[Question]\n{ques["text"]}\n\n' 93 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 94 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 95 | f'[System]\n{prompt}\n\n') 96 | js_list.append({ 97 | 'id': idx+1, 98 | 'question_id': ques['question_id'], 99 | 'answer1_id': ans1['answer_id'], 100 | 'answer2_id': ans2['answer_id'], 101 | 'category': category}) 102 | idx += 1 103 | handles.append(get_eval.remote(content, args.max_tokens)) 104 | # To avoid the rate limit set by OpenAI 105 | time.sleep(NUM_SECONDS_TO_SLEEP) 106 | 107 | reviews = ray.get(handles) 108 | for idx, review in enumerate(reviews): 109 | scores = parse_score(review) 110 | js_list[idx]['content'] = review 111 | js_list[idx]['tuple'] = scores 112 | review_file.write(json.dumps(js_list[idx]) + '\n') 113 | review_file.close() 114 | -------------------------------------------------------------------------------- /mgm/eval/eval_gpt_review_bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | 86 | if isinstance(inst['caption'], list): 87 | cap_str = '\n'.join(inst['caption']) 88 | else: 89 | cap_str = inst['caption'] 90 | 91 | category = 'llava_bench_' + json.loads(ques_js)['category'] 92 | if category in rule_dict: 93 | rule = rule_dict[category] 94 | else: 95 | assert False, f"Visual QA category not found in rule file: {category}." 
96 | prompt = rule['prompt'] 97 | role = rule['role'] 98 | content = (f'[Context]\n{cap_str}\n\n' 99 | f'[Question]\n{ques["text"]}\n\n' 100 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 101 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 102 | f'[System]\n{prompt}\n\n') 103 | cur_js = { 104 | 'id': idx+1, 105 | 'question_id': ques['question_id'], 106 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 107 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 108 | 'category': category 109 | } 110 | if idx >= len(cur_reviews): 111 | review = get_eval(content, args.max_tokens) 112 | scores = parse_score(review) 113 | cur_js['content'] = review 114 | cur_js['tuple'] = scores 115 | review_file.write(json.dumps(cur_js) + '\n') 116 | review_file.flush() 117 | else: 118 | print(f'Skipping {idx} as we already have it.') 119 | idx += 1 120 | print(idx) 121 | review_file.close() 122 | -------------------------------------------------------------------------------- /mgm/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import openai 6 | import time 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 54 | parser.add_argument('-q', '--question') 55 | parser.add_argument('-c', '--context') 56 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 57 | parser.add_argument('-r', '--rule') 58 | parser.add_argument('-o', '--output') 59 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 60 | args = parser.parse_args() 61 | 62 | f_q = open(os.path.expanduser(args.question)) 63 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 64 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 65 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 66 | 67 | if os.path.isfile(os.path.expanduser(args.output)): 68 | cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))] 69 | else: 70 | cur_reviews = [] 71 | 72 | review_file = open(f'{args.output}', 'a') 73 | 74 | context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))] 75 | image_to_context = {context['image']: context for context in context_list} 76 | 77 | handles = [] 78 | idx = 0 79 | for ques_js, ans1_js, ans2_js 
in zip(f_q, f_ans1, f_ans2): 80 | ques = json.loads(ques_js) 81 | ans1 = json.loads(ans1_js) 82 | ans2 = json.loads(ans2_js) 83 | 84 | inst = image_to_context[ques['image']] 85 | cap_str = '\n'.join(inst['captions']) 86 | box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | assert False, f"Visual QA category not found in rule file: {category}." 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 96 | f'[Question]\n{ques["text"]}\n\n' 97 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 98 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 99 | f'[System]\n{prompt}\n\n') 100 | cur_js = { 101 | 'id': idx+1, 102 | 'question_id': ques['question_id'], 103 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 104 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 105 | 'category': category 106 | } 107 | if idx >= len(cur_reviews): 108 | review = get_eval(content, args.max_tokens) 109 | scores = parse_score(review) 110 | cur_js['content'] = review 111 | cur_js['tuple'] = scores 112 | review_file.write(json.dumps(cur_js) + '\n') 113 | review_file.flush() 114 | else: 115 | print(f'Skipping {idx} as we already have it.') 116 | idx += 1 117 | print(idx) 118 | review_file.close() 119 | -------------------------------------------------------------------------------- /mgm/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | def eval_pope(answers, label_file): 6 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 7 | 8 | for answer in answers: 9 | text = answer['text'] 10 | 11 | # Only keep the first sentence 12 | if text.find('.') != -1: 13 | text = text.split('.')[0] 14 | 15 | text = text.replace(',', '') 16 | words = text.split(' ') 17 | if 'No' in words or 'not' in words or 'no' in words: 18 | answer['text'] = 'no' 19 | else: 20 | answer['text'] = 'yes' 21 | 22 | for i in range(len(label_list)): 23 | if label_list[i] == 'no': 24 | label_list[i] = 0 25 | else: 26 | label_list[i] = 1 27 | 28 | pred_list = [] 29 | for answer in answers: 30 | if answer['text'] == 'no': 31 | pred_list.append(0) 32 | else: 33 | pred_list.append(1) 34 | 35 | pos = 1 36 | neg = 0 37 | yes_ratio = pred_list.count(1) / len(pred_list) 38 | 39 | TP, TN, FP, FN = 0, 0, 0, 0 40 | for pred, label in zip(pred_list, label_list): 41 | if pred == pos and label == pos: 42 | TP += 1 43 | elif pred == pos and label == neg: 44 | FP += 1 45 | elif pred == neg and label == neg: 46 | TN += 1 47 | elif pred == neg and label == pos: 48 | FN += 1 49 | 50 | print('TP\tFP\tTN\tFN\t') 51 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 52 | 53 | precision = float(TP) / float(TP + FP) 54 | recall = float(TP) / float(TP + FN) 55 | f1 = 2*precision*recall / (precision + recall) 56 | acc = (TP + TN) / (TP + TN + FP + FN) 57 | print('Accuracy: {}'.format(acc)) 58 | print('Precision: {}'.format(precision)) 59 | print('Recall: {}'.format(recall)) 60 | print('F1 score: {}'.format(f1)) 61 | print('Yes ratio: {}'.format(yes_ratio)) 62 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) ) 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument("--annotation-dir", type=str) 67 | 
parser.add_argument("--question-file", type=str) 68 | parser.add_argument("--result-file", type=str) 69 | args = parser.parse_args() 70 | 71 | questions = [json.loads(line) for line in open(args.question_file)] 72 | questions = {question['question_id']: question for question in questions} 73 | answers = [json.loads(q) for q in open(args.result_file)] 74 | for file in os.listdir(args.annotation_dir): 75 | assert file.startswith('coco_pope_') 76 | assert file.endswith('.json') 77 | category = file[10:-5] 78 | cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category] 79 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 80 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 81 | print("====================================") 82 | -------------------------------------------------------------------------------- /mgm/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /mgm/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /mgm/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from mgm.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /mgm/eval/generate_webpage_data_from_table.py: -------------------------------------------------------------------------------- 1 | """Generate json file for webpage.""" 2 | import json 3 | import os 4 | import re 5 | 6 | # models = ['llama', 'alpaca', 'gpt35', 'bard'] 7 | models = ['vicuna'] 8 | 9 | 10 | def read_jsonl(path: str, key: str=None): 11 | data = [] 12 | with open(os.path.expanduser(path)) as f: 13 | for line in f: 14 | if not line: 15 | continue 16 | data.append(json.loads(line)) 17 | if key is not None: 18 | data.sort(key=lambda x: x[key]) 19 | data = {item[key]: item for item in data} 20 | return data 21 | 22 | 23 | def trim_hanging_lines(s: str, n: int) -> str: 24 | s = s.strip() 25 | for _ in range(n): 26 | s = s.split('\n', 1)[1].strip() 27 | return s 28 | 29 | 30 | if __name__ == '__main__': 31 | questions = read_jsonl('table/question.jsonl', key='question_id') 32 | 33 | # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id') 34 | # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id') 35 | # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id') 36 | # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id') 37 | vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id') 38 | ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id') 39 | 40 | review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id') 41 | # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id') 42 | # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', 
key='question_id') 43 | # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id') 44 | # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id') 45 | 46 | records = [] 47 | for qid in questions.keys(): 48 | r = { 49 | 'id': qid, 50 | 'category': questions[qid]['category'], 51 | 'question': questions[qid]['text'], 52 | 'answers': { 53 | # 'alpaca': alpaca_answers[qid]['text'], 54 | # 'llama': llama_answers[qid]['text'], 55 | # 'bard': bard_answers[qid]['text'], 56 | # 'gpt35': gpt35_answers[qid]['text'], 57 | 'vicuna': vicuna_answers[qid]['text'], 58 | 'ours': ours_answers[qid]['text'], 59 | }, 60 | 'evaluations': { 61 | # 'alpaca': review_alpaca[qid]['text'], 62 | # 'llama': review_llama[qid]['text'], 63 | # 'bard': review_bard[qid]['text'], 64 | 'vicuna': review_vicuna[qid]['content'], 65 | # 'gpt35': review_gpt35[qid]['text'], 66 | }, 67 | 'scores': { 68 | 'vicuna': review_vicuna[qid]['tuple'], 69 | # 'alpaca': review_alpaca[qid]['score'], 70 | # 'llama': review_llama[qid]['score'], 71 | # 'bard': review_bard[qid]['score'], 72 | # 'gpt35': review_gpt35[qid]['score'], 73 | }, 74 | } 75 | 76 | # cleanup data 77 | cleaned_evals = {} 78 | for k, v in r['evaluations'].items(): 79 | v = v.strip() 80 | lines = v.split('\n') 81 | # trim the first line if it's a pair of numbers 82 | if re.match(r'\d+[, ]+\d+', lines[0]): 83 | lines = lines[1:] 84 | v = '\n'.join(lines) 85 | cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**') 86 | 87 | r['evaluations'] = cleaned_evals 88 | records.append(r) 89 | 90 | # Reorder the records, this is optional 91 | for r in records: 92 | if r['id'] <= 20: 93 | r['id'] += 60 94 | else: 95 | r['id'] -= 20 96 | for r in records: 97 | if r['id'] <= 50: 98 | r['id'] += 10 99 | elif 50 < r['id'] <= 60: 100 | r['id'] -= 50 101 | for r in records: 102 | if r['id'] == 7: 103 | r['id'] = 1 104 | elif r['id'] < 7: 105 | r['id'] += 1 106 | 107 | records.sort(key=lambda x: x['id']) 108 | 109 | # Write to file 110 | with open('webpage/data.json', 'w') as f: 111 | json.dump({'questions': records, 'models': models}, f, indent=2) 112 | -------------------------------------------------------------------------------- /mgm/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria 3 | import torch 4 | import os 5 | import json 6 | from tqdm import tqdm 7 | import shortuuid 8 | 9 | from mgm.conversation import default_conversation 10 | from mgm.utils import disable_torch_init 11 | 12 | 13 | @torch.inference_mode() 14 | def eval_model(model_name, questions_file, answers_file): 15 | # Model 16 | disable_torch_init() 17 | model_name = os.path.expanduser(model_name) 18 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 19 | model = AutoModelForCausalLM.from_pretrained(model_name, 20 | torch_dtype=torch.float16).cuda() 21 | 22 | 23 | ques_file = open(os.path.expanduser(questions_file), "r") 24 | ans_file = open(os.path.expanduser(answers_file), "w") 25 | for i, line in enumerate(tqdm(ques_file)): 26 | idx = json.loads(line)["question_id"] 27 | qs = json.loads(line)["text"] 28 | cat = json.loads(line)["category"] 29 | conv = default_conversation.copy() 30 | conv.append_message(conv.roles[0], qs) 31 | prompt = conv.get_prompt() 32 | inputs = tokenizer([prompt]) 33 | input_ids = 
torch.as_tensor(inputs.input_ids).cuda() 34 | output_ids = model.generate( 35 | input_ids, 36 | do_sample=True, 37 | use_cache=True, 38 | temperature=0.7, 39 | max_new_tokens=1024,) 40 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0] 41 | try: 42 | index = outputs.index(conv.sep, len(prompt)) 43 | except ValueError: 44 | outputs += conv.sep 45 | index = outputs.index(conv.sep, len(prompt)) 46 | 47 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 48 | ans_id = shortuuid.uuid() 49 | ans_file.write(json.dumps({"question_id": idx, 50 | "text": outputs, 51 | "answer_id": ans_id, 52 | "model_id": model_name, 53 | "metadata": {}}) + "\n") 54 | ans_file.flush() 55 | ans_file.close() 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 60 | parser.add_argument("--question-file", type=str, default="tables/question.jsonl") 61 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 62 | args = parser.parse_args() 63 | 64 | eval_model(args.model_name, args.question_file, args.answers_file) -------------------------------------------------------------------------------- /mgm/eval/model_vqa_qbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | from tqdm import tqdm 4 | import json 5 | 6 | from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN 7 | from mgm.conversation import conv_templates, SeparatorStyle 8 | from mgm.model.builder import load_pretrained_model 9 | from mgm.utils import disable_torch_init 10 | from mgm.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 11 | 12 | from PIL import Image 13 | 14 | import requests 15 | from PIL import Image 16 | from io import BytesIO 17 | 18 | 19 | def load_image(image_file): 20 | if image_file.startswith('http') or image_file.startswith('https'): 21 | response = requests.get(image_file) 22 | image = Image.open(BytesIO(response.content)).convert('RGB') 23 | else: 24 | image = Image.open(image_file).convert('RGB') 25 | return image 26 | 27 | 28 | def eval_model(args): 29 | # Model 30 | disable_torch_init() 31 | 32 | model_name = get_model_name_from_path(args.model_path) 33 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, True) 34 | 35 | 36 | 37 | 38 | with open(args.questions_file) as f: 39 | llvqa_data = json.load(f) 40 | 41 | for i, llddata in enumerate(tqdm(llvqa_data)): 42 | filename = llddata["img_path"] 43 | if args.lang == "en": 44 | message = llddata["question"] + "\nChoose between one of the options as follows:\n" 45 | elif args.lang == "zh": 46 | message = llddata["question"] + "\在下列选项中选择一个:\n" 47 | else: 48 | raise NotImplementedError("Q-Bench does not support languages other than English (en) and Chinese (zh) yet. 
Contact us (https://github.com/VQAssessment/Q-Bench/) to convert Q-Bench into more languages.") 49 | for choice, ans in zip(["A.", "B.", "C.", "D."], llddata["candidates"]): 50 | message += f"{choice} {ans}\n" 51 | qs = message 52 | 53 | if model.config.mm_use_im_start_end: 54 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs 55 | else: 56 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 57 | 58 | if 'llama-2' in model_name.lower(): 59 | conv_mode = "llava_llama_2" 60 | elif "v1" in model_name.lower(): 61 | conv_mode = "llava_v1" 62 | elif "mpt" in model_name.lower(): 63 | conv_mode = "mpt" 64 | else: 65 | conv_mode = "llava_v0" 66 | 67 | if args.conv_mode is not None and conv_mode != args.conv_mode: 68 | print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode)) 69 | else: 70 | args.conv_mode = conv_mode 71 | 72 | conv = conv_templates[args.conv_mode].copy() 73 | conv.append_message(conv.roles[0], qs) 74 | conv.append_message(conv.roles[1], None) 75 | prompt = conv.get_prompt() 76 | 77 | image = load_image(args.image_folder + filename) 78 | image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda() 79 | 80 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 81 | 82 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 83 | keywords = [stop_str] 84 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 85 | 86 | 87 | with torch.inference_mode(): 88 | output_ids = model.generate( 89 | input_ids, 90 | images=image_tensor, 91 | num_beams=1, 92 | do_sample=False, 93 | temperature=0, 94 | max_new_tokens=1024, 95 | use_cache=True, 96 | stopping_criteria=[stopping_criteria]) 97 | 98 | input_token_len = input_ids.shape[1] 99 | n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() 100 | if n_diff_input_output > 0: 101 | print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 102 | outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] 103 | outputs = outputs.strip() 104 | if outputs.endswith(stop_str): 105 | outputs = outputs[:-len(stop_str)] 106 | outputs = outputs.strip() 107 | llddata["response"] = outputs 108 | with open(args.answers_file, "a") as wf: 109 | json.dump(llddata, wf) 110 | 111 | if __name__ == "__main__": 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument("--model-path", type=str, default="llava-v1.5") 114 | parser.add_argument("--model-base", type=str, default=None) 115 | parser.add_argument("--image-folder", type=str, default="./playground/data/qbench/images_llvisionqa") 116 | parser.add_argument("--questions-file", type=str, default="./playground/data/qbench/llvisionqa_dev.json") 117 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 118 | parser.add_argument("--conv-mode", type=str, default="llava_v1") 119 | parser.add_argument("--lang", type=str, default="en") 120 | args = parser.parse_args() 121 | 122 | eval_model(args) 123 | -------------------------------------------------------------------------------- /mgm/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import json 5 | 
import os 6 | import time 7 | import concurrent.futures 8 | 9 | import openai 10 | import tqdm 11 | import shortuuid 12 | 13 | MODEL = 'gpt-3.5-turbo' 14 | MODEL_ID = 'gpt-3.5-turbo:20230327' 15 | 16 | def get_answer(question_id: int, question: str, max_tokens: int): 17 | ans = { 18 | 'answer_id': shortuuid.uuid(), 19 | 'question_id': question_id, 20 | 'model_id': MODEL_ID, 21 | } 22 | for _ in range(3): 23 | try: 24 | response = openai.ChatCompletion.create( 25 | model=MODEL, 26 | messages=[{ 27 | 'role': 'system', 28 | 'content': 'You are a helpful assistant.' 29 | }, { 30 | 'role': 'user', 31 | 'content': question, 32 | }], 33 | max_tokens=max_tokens, 34 | ) 35 | ans['text'] = response['choices'][0]['message']['content'] 36 | return ans 37 | except Exception as e: 38 | print('[ERROR]', e) 39 | ans['text'] = '#ERROR#' 40 | time.sleep(1) 41 | return ans 42 | 43 | 44 | if __name__ == '__main__': 45 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 46 | parser.add_argument('-q', '--question') 47 | parser.add_argument('-o', '--output') 48 | parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output') 49 | args = parser.parse_args() 50 | 51 | questions_dict = {} 52 | with open(os.path.expanduser(args.question)) as f: 53 | for line in f: 54 | if not line: 55 | continue 56 | q = json.loads(line) 57 | questions_dict[q['question_id']] = q['text'] 58 | 59 | answers = [] 60 | 61 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 62 | futures = [] 63 | for qid, question in questions_dict.items(): 64 | future = executor.submit(get_answer, qid, question, args.max_tokens) 65 | futures.append(future) 66 | 67 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 68 | answers.append(future.result()) 69 | 70 | answers.sort(key=lambda x: x['question_id']) 71 | 72 | with open(os.path.expanduser(args.output), 'w') as f: 73 | table = [json.dumps(ans) for ans in answers] 74 | f.write('\n'.join(table)) 75 | -------------------------------------------------------------------------------- /mgm/eval/run_llava.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from mgm.constants import ( 5 | IMAGE_TOKEN_INDEX, 6 | DEFAULT_IMAGE_TOKEN, 7 | DEFAULT_IM_START_TOKEN, 8 | DEFAULT_IM_END_TOKEN, 9 | IMAGE_PLACEHOLDER, 10 | ) 11 | from mgm.conversation import conv_templates, SeparatorStyle 12 | from mgm.model.builder import load_pretrained_model 13 | from mgm.utils import disable_torch_init 14 | from mgm.mm_utils import ( 15 | process_images, 16 | tokenizer_image_token, 17 | get_model_name_from_path, 18 | ) 19 | 20 | from PIL import Image 21 | 22 | import requests 23 | from PIL import Image 24 | from io import BytesIO 25 | import re 26 | 27 | 28 | def image_parser(args): 29 | out = args.image_file.split(args.sep) 30 | return out 31 | 32 | 33 | def load_image(image_file): 34 | if image_file.startswith("http") or image_file.startswith("https"): 35 | response = requests.get(image_file) 36 | image = Image.open(BytesIO(response.content)).convert("RGB") 37 | else: 38 | image = Image.open(image_file).convert("RGB") 39 | return image 40 | 41 | 42 | def load_images(image_files): 43 | out = [] 44 | for image_file in image_files: 45 | image = load_image(image_file) 46 | out.append(image) 47 | return out 48 | 49 | 50 | def eval_model(args): 51 | # Model 52 | disable_torch_init() 53 | 54 | model_name = 
get_model_name_from_path(args.model_path) 55 | tokenizer, model, image_processor, context_len = load_pretrained_model( 56 | args.model_path, args.model_base, model_name 57 | ) 58 | 59 | qs = args.query 60 | image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN 61 | if IMAGE_PLACEHOLDER in qs: 62 | if model.config.mm_use_im_start_end: 63 | qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs) 64 | else: 65 | qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs) 66 | else: 67 | if model.config.mm_use_im_start_end: 68 | qs = image_token_se + "\n" + qs 69 | else: 70 | qs = DEFAULT_IMAGE_TOKEN + "\n" + qs 71 | 72 | if "llama-2" in model_name.lower(): 73 | conv_mode = "llava_llama_2" 74 | elif "mistral" in model_name.lower(): 75 | conv_mode = "mistral_instruct" 76 | elif "v1.6-34b" in model_name.lower(): 77 | conv_mode = "chatml_direct" 78 | elif "v1" in model_name.lower(): 79 | conv_mode = "llava_v1" 80 | elif "mpt" in model_name.lower(): 81 | conv_mode = "mpt" 82 | else: 83 | conv_mode = "llava_v0" 84 | 85 | if args.conv_mode is not None and conv_mode != args.conv_mode: 86 | print( 87 | "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format( 88 | conv_mode, args.conv_mode, args.conv_mode 89 | ) 90 | ) 91 | else: 92 | args.conv_mode = conv_mode 93 | 94 | conv = conv_templates[args.conv_mode].copy() 95 | conv.append_message(conv.roles[0], qs) 96 | conv.append_message(conv.roles[1], None) 97 | prompt = conv.get_prompt() 98 | 99 | image_files = image_parser(args) 100 | images = load_images(image_files) 101 | images_tensor = process_images( 102 | images, 103 | image_processor, 104 | model.config 105 | ).to(model.device, dtype=torch.float16) 106 | 107 | input_ids = ( 108 | tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") 109 | .unsqueeze(0) 110 | .cuda() 111 | ) 112 | 113 | with torch.inference_mode(): 114 | output_ids = model.generate( 115 | input_ids, 116 | images=images_tensor, 117 | do_sample=True if args.temperature > 0 else False, 118 | temperature=args.temperature, 119 | top_p=args.top_p, 120 | num_beams=args.num_beams, 121 | max_new_tokens=args.max_new_tokens, 122 | use_cache=True, 123 | ) 124 | 125 | outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 126 | print(outputs) 127 | 128 | 129 | if __name__ == "__main__": 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 132 | parser.add_argument("--model-base", type=str, default=None) 133 | parser.add_argument("--image-file", type=str, required=True) 134 | parser.add_argument("--query", type=str, required=True) 135 | parser.add_argument("--conv-mode", type=str, default=None) 136 | parser.add_argument("--sep", type=str, default=",") 137 | parser.add_argument("--temperature", type=float, default=0.2) 138 | parser.add_argument("--top_p", type=float, default=None) 139 | parser.add_argument("--num_beams", type=int, default=1) 140 | parser.add_argument("--max_new_tokens", type=int, default=512) 141 | args = parser.parse_args() 142 | 143 | eval_model(args) -------------------------------------------------------------------------------- /mgm/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | 7 | import argparse 8 | 9 | def parse_args(): 10 | parser = 
argparse.ArgumentParser(description='ChatGPT-based QA evaluation.') 11 | parser.add_argument('-d', '--dir', default=None) 12 | parser.add_argument('-v', '--version', default=None) 13 | parser.add_argument('-s', '--select', nargs='*', default=None) 14 | parser.add_argument('-f', '--files', nargs='*', default=[]) 15 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == '__main__': 20 | args = parse_args() 21 | 22 | if args.ignore is not None: 23 | args.ignore = [int(x) for x in args.ignore] 24 | 25 | if len(args.files) > 0: 26 | review_files = args.files 27 | else: 28 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 29 | 30 | for review_file in sorted(review_files): 31 | config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '') 32 | if args.select is not None and any(x not in config for x in args.select): 33 | continue 34 | if '0613' in config: 35 | version = '0613' 36 | else: 37 | version = '0314' 38 | if args.version is not None and args.version != version: 39 | continue 40 | scores = defaultdict(list) 41 | print(config) 42 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 43 | for review_str in f: 44 | review = json.loads(review_str) 45 | if review['question_id'] in args.ignore: 46 | continue 47 | if 'category' in review: 48 | scores[review['category']].append(review['tuple']) 49 | scores['all'].append(review['tuple']) 50 | else: 51 | if 'tuple' in review: 52 | scores['all'].append(review['tuple']) 53 | else: 54 | scores['all'].append(review['score']) 55 | for k, v in sorted(scores.items()): 56 | stats = np.asarray(v).mean(0).tolist() 57 | stats = [round(x, 3) for x in stats] 58 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 59 | print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 60 | print('=================================') 61 | -------------------------------------------------------------------------------- /mgm/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /mgm/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /mgm/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mgm/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /mgm/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /mgm/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /mgm/eval/webpage/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 3 | background-color: #f8f9fa; 4 | } 5 | 6 | .navbar-dark .navbar-nav .nav-link { 7 | color: #f1cf68; 8 | font-size: 1.1rem; 9 | padding: 0.5rem 0.6rem; 10 | } 11 | 12 | .card-header { 13 | font-weight: bold; 14 | } 15 | 16 | .card { 17 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 18 | transition: 0.3s; 19 | } 20 | 21 | .card:hover { 22 | box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2); 23 | } 24 | 25 | button { 26 | transition: background-color 0.3s; 27 | } 28 | 29 | button:hover { 30 | background-color: #007bff; 31 | } 32 | 33 | @media (max-width: 767px) { 34 | .form-row .form-group { 35 | margin-bottom: 10px; 36 | } 37 | } 38 | 39 | /* Extra styles */ 40 | 41 | .expandable-card .card-text-container { 42 | max-height: 200px; 43 | overflow-y: hidden; 44 | position: relative; 45 | } 46 | 47 | .expandable-card.expanded .card-text-container { 48 | max-height: none; 49 | } 50 | 51 | .expand-btn { 52 | position: relative; 53 | display: none; 54 | background-color: rgba(255, 255, 255, 0.8); 55 | color: #510c75; 56 | border-color: transparent; 57 | } 58 | 59 | .expand-btn:hover { 60 | background-color: rgba(200, 200, 200, 0.8); 61 | text-decoration: none; 62 | border-color: transparent; 63 | color: #510c75; 64 | } 65 | 66 | .expand-btn:focus { 67 | outline: none; 68 | text-decoration: none; 69 | } 70 | 71 | .expandable-card:not(.expanded) .card-text-container:after { 72 | content: ""; 73 | position: absolute; 74 | bottom: 0; 75 | left: 0; 76 | width: 100%; 77 | height: 90px; 78 | background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1)); 79 | } 80 | 81 | .expandable-card:not(.expanded) .expand-btn { 82 | margin-top: -40px; 83 | } 84 | 85 | .card-body { 86 | padding-bottom: 5px; 87 | } 88 | 89 | .vertical-flex-layout { 90 | justify-content: center; 91 | align-items: center; 92 | height: 100%; 93 | display: flex; 94 | flex-direction: column; 95 | gap: 5px; 96 | } 97 | 98 | .figure-img { 99 | max-width: 100%; 100 | height: auto; 101 | } 102 | 103 | .adjustable-font-size { 104 | font-size: calc(0.5rem + 2vw); 105 | } 106 | -------------------------------------------------------------------------------- /mgm/mm_utils.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from io import BytesIO 3 | import base64 4 | 5 | import torch 6 | from transformers import StoppingCriteria 7 | from mgm.constants import IMAGE_TOKEN_INDEX 8 | 9 | 10 | def load_image_from_base64(image): 11 | return Image.open(BytesIO(base64.b64decode(image))) 12 | 13 | 14 | def expand2square(pil_img, background_color): 15 | width, height = pil_img.size 16 | if width == height: 17 | return pil_img 18 | elif width > height: 19 | result = Image.new(pil_img.mode, (width, width), background_color) 20 | result.paste(pil_img, (0, (width - height) // 2)) 21 | return result 22 | else: 23 | result = Image.new(pil_img.mode, (height, height), background_color) 24 | 
result.paste(pil_img, ((height - width) // 2, 0)) 25 | return result 26 | 27 | 28 | def process_images(images, image_processor, model_cfg): 29 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 30 | new_images = [] 31 | if image_aspect_ratio == 'pad': 32 | for image in images: 33 | image = expand2square(image.convert('RGB'), tuple(int(x*255) for x in image_processor.image_mean)) 34 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 35 | new_images.append(image) 36 | else: 37 | return image_processor(images, return_tensors='pt')['pixel_values'] 38 | if all(x.shape == new_images[0].shape for x in new_images): 39 | new_images = torch.stack(new_images, dim=0) 40 | return new_images 41 | 42 | 43 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 44 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')] 45 | 46 | def insert_separator(X, sep): 47 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 48 | 49 | input_ids = [] 50 | offset = 0 51 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 52 | offset = 1 53 | input_ids.append(prompt_chunks[0][0]) 54 | 55 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 56 | input_ids.extend(x[offset:]) 57 | 58 | if return_tensors is not None: 59 | if return_tensors == 'pt': 60 | return torch.tensor(input_ids, dtype=torch.long) 61 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 62 | return input_ids 63 | 64 | 65 | def get_model_name_from_path(model_path): 66 | model_path = model_path.strip("/") 67 | model_paths = model_path.split("/") 68 | if model_paths[-1].startswith('checkpoint-'): 69 | return model_paths[-2] + "_" + model_paths[-1] 70 | else: 71 | return model_paths[-1] 72 | 73 | class KeywordsStoppingCriteria(StoppingCriteria): 74 | def __init__(self, keywords, tokenizer, input_ids): 75 | self.keywords = keywords 76 | self.keyword_ids = [] 77 | self.max_keyword_len = 0 78 | for keyword in keywords: 79 | cur_keyword_ids = tokenizer(keyword).input_ids 80 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 81 | cur_keyword_ids = cur_keyword_ids[1:] 82 | if len(cur_keyword_ids) > self.max_keyword_len: 83 | self.max_keyword_len = len(cur_keyword_ids) 84 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 85 | self.tokenizer = tokenizer 86 | self.start_len = input_ids.shape[1] 87 | 88 | def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 89 | offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 90 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 91 | for keyword_id in self.keyword_ids: 92 | truncated_output_ids = output_ids[0, -keyword_id.shape[0]:] 93 | if torch.equal(truncated_output_ids, keyword_id): 94 | return True 95 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 96 | for keyword in self.keywords: 97 | if keyword in outputs: 98 | return True 99 | return False 100 | 101 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 102 | outputs = [] 103 | for i in range(output_ids.shape[0]): 104 | outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores)) 105 | return all(outputs) --------------------------------------------------------------------------------
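The two helpers above, tokenizer_image_token and KeywordsStoppingCriteria, are what the evaluation scripts in this repository use to build generation inputs. The snippet below is a minimal, illustrative usage sketch only, not part of mgm/mm_utils.py; it assumes `tokenizer`, `model`, `image_tensor`, `prompt`, and `stop_str` have already been prepared (e.g. via load_pretrained_model and a conversation template, as in mgm/eval/model_vqa_qbench.py).

# Illustrative sketch (assumed setup, not repository code): wiring the helpers above into generation.
from mgm.constants import IMAGE_TOKEN_INDEX
from mgm.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria

# `prompt` contains the <image> placeholder; tokenizer_image_token splices IMAGE_TOKEN_INDEX at that position.
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
# Stop generation once the conversation separator (stop_str) appears in the decoded output.
stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
output_ids = model.generate(input_ids, images=image_tensor, do_sample=False, max_new_tokens=128, use_cache=True, stopping_criteria=[stopping_criteria])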
/mgm/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.mgm_llama import MGMLlamaForCausalLM 2 | try: 3 | from .language_model.mgm_mistral import MGMMistralForCausalLM 4 | from .language_model.mgm_mixtral import MGMMixtralForCausalLM 5 | from .language_model.mgm_gemma import MGMGemmaForCausalLM 6 | except: 7 | ImportWarning("New model not imported. Try to update Transformers.") -------------------------------------------------------------------------------- /mgm/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m mgm.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from mgm.model import * 10 | from mgm.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /mgm/model/language_model/mgm_mistral.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ------------------------------------------------------------------------ 15 | # Modified from LLaVA (https://github.com/haotian-liu/LLaVA) 16 | # Copyright 2024 Yanwei Li 17 | # ------------------------------------------------------------------------ 18 | 19 | from typing import List, Optional, Tuple, Union 20 | 21 | import torch 22 | import torch.nn as nn 23 | 24 | from transformers import AutoConfig, AutoModelForCausalLM, \ 25 | MistralConfig, MistralModel, MistralForCausalLM 26 | 27 | from transformers.modeling_outputs import CausalLMOutputWithPast 28 | from transformers.generation.utils import GenerateOutput 29 | from transformers.generation.utils import logging 30 | 31 | from ..mgm_arch import MGMMetaModel, MGMMetaForCausalLM 32 | 33 | logger = logging.get_logger(__name__) 34 | 35 | class MGMConfig(MistralConfig): 36 | model_type = "mgm_mistral" 37 | 38 | 39 | class MGMMistralModel(MGMMetaModel, MistralModel): 40 | config_class = MGMConfig 41 | 42 | def __init__(self, config: MistralConfig): 43 | super(MGMMistralModel, self).__init__(config) 44 | # self.max_pos_idx = 0 45 | 46 | class MGMMistralForCausalLM(MistralForCausalLM, MGMMetaForCausalLM): 47 | config_class = MGMConfig 48 | 49 | def __init__(self, config): 50 | super(MistralForCausalLM, self).__init__(config) 51 | self.model = MGMMistralModel(config) 52 | # self.pretraining_tp = config.pretraining_tp 53 | self.vocab_size = config.vocab_size 54 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 55 | 56 | # Initialize weights and apply final processing 57 | self.post_init() 58 | 59 | def get_model(self): 60 | return self.model 61 | 62 | def forward( 63 | self, 64 | input_ids: torch.LongTensor = None, 65 | attention_mask: Optional[torch.Tensor] = None, 66 | position_ids: Optional[torch.LongTensor] = None, 67 | past_key_values: Optional[List[torch.FloatTensor]] = None, 68 | inputs_embeds: Optional[torch.FloatTensor] = None, 69 | labels: Optional[torch.LongTensor] = None, 70 | use_cache: Optional[bool] = None, 71 | output_attentions: Optional[bool] = None, 72 | output_hidden_states: Optional[bool] = None, 73 | images: Optional[torch.FloatTensor] = None, 74 | images_aux: Optional[torch.FloatTensor] = None, 75 | return_dict: Optional[bool] = None, 76 | ) -> Union[Tuple, CausalLMOutputWithPast]: 77 | 78 | if inputs_embeds is None: 79 | ( 80 | input_ids, 81 | position_ids, 82 | attention_mask, 83 | past_key_values, 84 | inputs_embeds, 85 | labels 86 | ) = self.prepare_inputs_labels_for_multimodal( 87 | input_ids, 88 | position_ids, 89 | attention_mask, 90 | past_key_values, 91 | labels, 92 | images, 93 | images_aux 94 | ) 95 | 96 | return super().forward( 97 | input_ids=input_ids, 98 | attention_mask=attention_mask, 99 | position_ids=position_ids, 100 | past_key_values=past_key_values, 101 | inputs_embeds=inputs_embeds, 102 | labels=labels, 103 | use_cache=use_cache, 104 | output_attentions=output_attentions, 105 | output_hidden_states=output_hidden_states, 106 | return_dict=return_dict 107 | ) 108 | 109 | @torch.no_grad() 110 | def generate( 111 | self, 112 | inputs: Optional[torch.Tensor] = None, 113 | images: Optional[torch.Tensor] = None, 114 | images_aux: Optional[torch.FloatTensor] = None, 115 | **kwargs, 116 | ) -> Union[GenerateOutput, torch.LongTensor]: 117 | position_ids = kwargs.pop("position_ids", None) 118 | attention_mask = kwargs.pop("attention_mask", None) 119 | if "inputs_embeds" in kwargs: 120 | raise NotImplementedError("`inputs_embeds` is not supported") 121 | 122 | if images is not 
None: 123 | ( 124 | inputs, 125 | position_ids, 126 | attention_mask, 127 | _, 128 | inputs_embeds, 129 | _ 130 | ) = self.prepare_inputs_labels_for_multimodal( 131 | inputs, 132 | position_ids, 133 | attention_mask, 134 | None, 135 | None, 136 | images, 137 | images_aux 138 | ) 139 | else: 140 | inputs_embeds = self.get_model().embed_tokens(inputs) 141 | 142 | return super().generate( 143 | position_ids=position_ids, 144 | attention_mask=attention_mask, 145 | inputs_embeds=inputs_embeds, 146 | **kwargs 147 | ) 148 | 149 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 150 | images = kwargs.pop("images", None) 151 | images_aux = kwargs.pop("images_aux", None) 152 | _inputs = super().prepare_inputs_for_generation( 153 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 154 | ) 155 | if images is not None: 156 | _inputs['images'] = images 157 | if images_aux is not None: 158 | _inputs['images_aux'] = images_aux 159 | return _inputs 160 | 161 | AutoConfig.register("mgm_mistral", MGMConfig) 162 | AutoModelForCausalLM.register(MGMConfig, MGMMistralForCausalLM) -------------------------------------------------------------------------------- /mgm/model/language_model/mgm_mixtral.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Haotian Liu 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ------------------------------------------------------------------------ 15 | # Modified from LLaVA (https://github.com/haotian-liu/LLaVA) 16 | # Copyright 2024 Yanwei Li 17 | # ------------------------------------------------------------------------ 18 | 19 | from typing import List, Optional, Tuple, Union 20 | 21 | import torch 22 | import torch.nn as nn 23 | 24 | from transformers import AutoConfig, AutoModelForCausalLM, \ 25 | MixtralConfig, MixtralModel, MixtralForCausalLM 26 | 27 | from transformers.modeling_outputs import CausalLMOutputWithPast 28 | from transformers.generation.utils import GenerateOutput 29 | from transformers.generation.utils import logging 30 | 31 | from ..mgm_arch import MGMMetaModel, MGMMetaForCausalLM 32 | 33 | logger = logging.get_logger(__name__) 34 | 35 | class MGMConfig(MixtralConfig): 36 | model_type = "mgm_mixtral" 37 | 38 | 39 | class MGMMixtralModel(MGMMetaModel, MixtralModel): 40 | config_class = MGMConfig 41 | 42 | def __init__(self, config: MixtralConfig): 43 | super(MGMMixtralModel, self).__init__(config) 44 | # self.max_pos_idx = 0 45 | 46 | class MGMMixtralForCausalLM(MixtralForCausalLM, MGMMetaForCausalLM): 47 | config_class = MGMConfig 48 | 49 | def __init__(self, config): 50 | super(MixtralForCausalLM, self).__init__(config) 51 | self.model = MGMMixtralModel(config) 52 | # self.pretraining_tp = config.pretraining_tp 53 | self.vocab_size = config.vocab_size 54 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 55 | 56 | # Initialize weights and apply final processing 57 | self.post_init() 58 | 59 | def get_model(self): 60 | return self.model 61 | 62 | def forward( 63 | self, 64 | input_ids: torch.LongTensor = None, 65 | attention_mask: Optional[torch.Tensor] = None, 66 | position_ids: Optional[torch.LongTensor] = None, 67 | past_key_values: Optional[List[torch.FloatTensor]] = None, 68 | inputs_embeds: Optional[torch.FloatTensor] = None, 69 | labels: Optional[torch.LongTensor] = None, 70 | use_cache: Optional[bool] = None, 71 | output_attentions: Optional[bool] = None, 72 | output_hidden_states: Optional[bool] = None, 73 | images: Optional[torch.FloatTensor] = None, 74 | images_aux: Optional[torch.FloatTensor] = None, 75 | return_dict: Optional[bool] = None, 76 | ) -> Union[Tuple, CausalLMOutputWithPast]: 77 | 78 | if inputs_embeds is None: 79 | ( 80 | input_ids, 81 | position_ids, 82 | attention_mask, 83 | past_key_values, 84 | inputs_embeds, 85 | labels 86 | ) = self.prepare_inputs_labels_for_multimodal( 87 | input_ids, 88 | position_ids, 89 | attention_mask, 90 | past_key_values, 91 | labels, 92 | images, 93 | images_aux 94 | ) 95 | 96 | return super().forward( 97 | input_ids=input_ids, 98 | attention_mask=attention_mask, 99 | position_ids=position_ids, 100 | past_key_values=past_key_values, 101 | inputs_embeds=inputs_embeds, 102 | labels=labels, 103 | use_cache=use_cache, 104 | output_attentions=output_attentions, 105 | output_hidden_states=output_hidden_states, 106 | return_dict=return_dict 107 | ) 108 | 109 | @torch.no_grad() 110 | def generate( 111 | self, 112 | inputs: Optional[torch.Tensor] = None, 113 | images: Optional[torch.Tensor] = None, 114 | images_aux: Optional[torch.FloatTensor] = None, 115 | **kwargs, 116 | ) -> Union[GenerateOutput, torch.LongTensor]: 117 | position_ids = kwargs.pop("position_ids", None) 118 | attention_mask = kwargs.pop("attention_mask", None) 119 | if "inputs_embeds" in kwargs: 120 | raise NotImplementedError("`inputs_embeds` is not supported") 121 | 122 | if images is not 
None: 123 | ( 124 | inputs, 125 | position_ids, 126 | attention_mask, 127 | _, 128 | inputs_embeds, 129 | _ 130 | ) = self.prepare_inputs_labels_for_multimodal( 131 | inputs, 132 | position_ids, 133 | attention_mask, 134 | None, 135 | None, 136 | images, 137 | images_aux 138 | ) 139 | else: 140 | inputs_embeds = self.get_model().embed_tokens(inputs) 141 | 142 | return super().generate( 143 | position_ids=position_ids, 144 | attention_mask=attention_mask, 145 | inputs_embeds=inputs_embeds, 146 | **kwargs 147 | ) 148 | 149 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): 150 | images = kwargs.pop("images", None) 151 | images_aux = kwargs.pop("images_aux", None) 152 | _inputs = super().prepare_inputs_for_generation( 153 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs 154 | ) 155 | if images is not None: 156 | _inputs['images'] = images 157 | if images_aux is not None: 158 | _inputs['images_aux'] = images_aux 159 | return _inputs 160 | 161 | AutoConfig.register("mgm_mixtral", MGMConfig) 162 | AutoModelForCausalLM.register(MGMConfig, MGMMixtralForCausalLM) -------------------------------------------------------------------------------- /mgm/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | from .eva_encoder import EVAVisionTower 4 | from .openclip_encoder import OpenCLIPVisionTower 5 | 6 | 7 | def build_vision_tower(vision_tower_cfg, **kwargs): 8 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 9 | image_processor = getattr(vision_tower_cfg, 'image_processor', getattr(vision_tower_cfg, 'image_processor', "../processor/clip-patch14-224")) 10 | 11 | if not os.path.exists(vision_tower): 12 | raise ValueError(f'Not find vision tower: {vision_tower}') 13 | 14 | if "openai" in vision_tower.lower() or "ShareGPT4V" in vision_tower: 15 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | elif "lavis" in vision_tower.lower() or "eva" in vision_tower.lower(): 17 | return EVAVisionTower(vision_tower, image_processor, args=vision_tower_cfg, **kwargs) 18 | else: 19 | raise ValueError(f'Unknown vision tower: {vision_tower}') 20 | 21 | 22 | def build_vision_tower_aux(vision_tower_cfg, **kwargs): 23 | vision_tower_aux = getattr(vision_tower_cfg, 'mm_vision_tower_aux', getattr(vision_tower_cfg, 'vision_tower_aux', None)) 24 | 25 | if not os.path.exists(vision_tower_aux): 26 | raise ValueError(f'Not find vision tower: {vision_tower_aux}') 27 | 28 | if "openclip" in vision_tower_aux.lower(): 29 | return OpenCLIPVisionTower(vision_tower_aux, args=vision_tower_cfg, **kwargs) 30 | elif "openai" in vision_tower_aux.lower(): 31 | return CLIPVisionTower(vision_tower_aux, args=vision_tower_cfg, **kwargs) 32 | else: 33 | raise ValueError(f'Unknown vision tower: {vision_tower_aux}') -------------------------------------------------------------------------------- /mgm/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | from ..processor.video_processor import VideoFramesProcessor 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | 
self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | self.is_optimize = getattr(args, 'optimize_vision_tower', False) 17 | 18 | if not delay_load: 19 | self.load_model() 20 | elif getattr(args, 'unfreeze_mm_vision_tower', False): 21 | self.load_model() 22 | else: 23 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 24 | 25 | def load_model(self): 26 | self.image_processor = VideoFramesProcessor.from_pretrained(self.vision_tower_name) 27 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 28 | self.vision_tower.requires_grad_(False) 29 | 30 | self.is_loaded = True 31 | 32 | def feature_select(self, image_forward_outs): 33 | image_features = image_forward_outs.hidden_states[self.select_layer] 34 | if self.select_feature == 'patch': 35 | image_features = image_features[:, 1:] 36 | elif self.select_feature == 'cls_patch': 37 | image_features = image_features 38 | else: 39 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 40 | return image_features 41 | 42 | def image_forward(self, images): 43 | if type(images) is list: 44 | image_features = [] 45 | for image in images: 46 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 47 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 48 | image_features.append(image_feature) 49 | else: 50 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 51 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 52 | 53 | return image_features 54 | 55 | def forward(self, images): 56 | if not self.is_optimize: 57 | with torch.no_grad(): 58 | image_features = self.image_forward(images) 59 | else: 60 | image_features = self.image_forward(images) 61 | 62 | return image_features 63 | 64 | @property 65 | def dummy_feature(self): 66 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 67 | 68 | @property 69 | def dtype(self): 70 | return self.vision_tower.dtype 71 | 72 | @property 73 | def device(self): 74 | return self.vision_tower.device 75 | 76 | @property 77 | def config(self): 78 | if self.is_loaded: 79 | return self.vision_tower.config 80 | else: 81 | return self.cfg_only 82 | 83 | @property 84 | def hidden_size(self): 85 | return self.config.hidden_size 86 | 87 | @property 88 | def num_patches(self): 89 | return (self.config.image_size // self.config.patch_size) ** 2 -------------------------------------------------------------------------------- /mgm/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | class IdentityMap(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def forward(self, x, *args, **kwargs): 10 | return x 11 | 12 | @property 13 | def config(self): 14 | return {"mm_projector_type": 'identity'} 15 | 16 | 17 | class SimpleResBlock(nn.Module): 18 | def __init__(self, channels): 19 | super().__init__() 20 | self.pre_norm = nn.LayerNorm(channels) 21 | 22 | self.proj = nn.Sequential( 23 | nn.Linear(channels, channels), 24 | nn.GELU(), 25 | nn.Linear(channels, channels) 26 | ) 27 | def forward(self, x): 28 | x = self.pre_norm(x) 29 | return x + self.proj(x) 30 | 31 | 32 | def 
build_vision_projector(config, delay_load=False, **kwargs): 33 | projector_type = getattr(config, 'mm_projector_type', 'linear') 34 | 35 | if projector_type == 'linear': 36 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 37 | 38 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 39 | if mlp_gelu_match: 40 | mlp_depth = int(mlp_gelu_match.group(1)) 41 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 42 | for _ in range(1, mlp_depth): 43 | modules.append(nn.GELU()) 44 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 45 | return nn.Sequential(*modules) 46 | 47 | if projector_type == 'identity': 48 | return IdentityMap() 49 | 50 | raise ValueError(f'Unknown projector type: {projector_type}') -------------------------------------------------------------------------------- /mgm/model/processor/video_processor.py: -------------------------------------------------------------------------------- 1 | from transformers import CLIPImageProcessor 2 | from transformers.image_processing_utils import BatchFeature, get_size_dict 3 | from transformers.image_transforms import get_resize_output_image_size 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | import numpy as np 9 | 10 | 11 | class VideoFramesProcessor(CLIPImageProcessor): 12 | 13 | def __init__(self, **kwargs): 14 | super().__init__(**kwargs) 15 | 16 | def preprocess(self, images, **kwargs): 17 | if not isinstance(images, np.ndarray): 18 | return super().preprocess(images=images, **kwargs) 19 | 20 | do_resize = kwargs.get('do_resize', self.do_resize) 21 | size = kwargs.get('size', self.size) 22 | size = get_size_dict(size, param_name="size", default_to_square=False) 23 | do_center_crop = kwargs.get('do_center_crop', self.do_center_crop) 24 | crop_size = kwargs.get('crop_size', self.crop_size) 25 | crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) 26 | do_rescale = kwargs.get('do_rescale', self.do_rescale) 27 | rescale_factor = kwargs.get('rescale_factor', self.rescale_factor) 28 | do_normalize = kwargs.get('do_normalize', self.do_normalize) 29 | image_mean = kwargs.get('image_mean', self.image_mean) 30 | image_std = kwargs.get('image_std', self.image_std) 31 | return_tensors = kwargs.get('return_tensors', None) 32 | 33 | def resize(images, output_size): 34 | images = images.permute((0, 3, 1, 2)) 35 | images = F.interpolate(images, size=output_size, mode='bicubic') 36 | images = images.permute((0, 2, 3, 1)) 37 | return images 38 | 39 | def center_crop(images, crop_size): 40 | crop_width, crop_height = crop_size["width"], crop_size["height"] 41 | img_width, img_height = images.shape[1:3] 42 | x = (img_width - crop_width) // 2 43 | y = (img_height - crop_height) // 2 44 | images = images[:, x:x+crop_width, y:y+crop_height] 45 | return images 46 | 47 | def rescale(images, rescale_factor): 48 | images = images * rescale_factor 49 | return images 50 | 51 | def normalize(images, mean, std): 52 | mean = torch.tensor(mean) 53 | std = torch.tensor(std) 54 | images = (images - mean) / std 55 | return images 56 | 57 | images = torch.from_numpy(images).float() 58 | 59 | if do_resize: 60 | output_size = get_resize_output_image_size(images[0], size=size["shortest_edge"], default_to_square=False) 61 | images = resize(images, output_size) 62 | 63 | if do_center_crop: 64 | images = center_crop(images, crop_size) 65 | 66 | if do_rescale: 67 | images = rescale(images, rescale_factor) 68 | 69 | if do_normalize: 70 | images = normalize(images, image_mean, 
image_std) 71 | 72 | images = images.permute((0, 3, 1, 2)) 73 | data = {"pixel_values": images} 74 | return BatchFeature(data=data, tensor_type=return_tensors) 75 | -------------------------------------------------------------------------------- /mgm/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/serve/__init__.py -------------------------------------------------------------------------------- /mgm/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /mgm/serve/examples/monday.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/serve/examples/monday.jpg -------------------------------------------------------------------------------- /mgm/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /mgm/serve/examples/woolen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dvlab-research/MGM/769820cb71afc7c2b9a2166ca3ce92db1636215c/mgm/serve/examples/woolen.png -------------------------------------------------------------------------------- /mgm/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
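(Note: the usage line below still shows a fastchat module path; in this repository the script itself lives at mgm.serve.register_worker.)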
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /mgm/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from mgm.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /mgm/train/llama_xformers_attn_monkey_patch.py: -------------------------------------------------------------------------------- 1 | """ 2 | Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments 3 | """ 4 | 5 | import logging 6 | import math 7 | from typing import Optional, Tuple 8 | 9 | import torch 10 | import transformers.models.llama.modeling_llama 11 | from torch import nn 12 | 13 | try: 14 | 
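# xformers is optional at import time: a missing install only logs an error here, but the xformers.ops calls further down will then fail at runtime.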
import xformers.ops 15 | except ImportError: 16 | logging.error("xformers not found! Please install it before trying to use it.") 17 | 18 | 19 | def replace_llama_attn_with_xformers_attn(): 20 | transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward 21 | 22 | 23 | def xformers_forward( 24 | self, 25 | hidden_states: torch.Tensor, 26 | attention_mask: Optional[torch.Tensor] = None, 27 | position_ids: Optional[torch.LongTensor] = None, 28 | past_key_value: Optional[Tuple[torch.Tensor]] = None, 29 | output_attentions: bool = False, 30 | use_cache: bool = False, 31 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: 32 | # pylint: disable=duplicate-code 33 | bsz, q_len, _ = hidden_states.size() 34 | 35 | query_states = ( 36 | self.q_proj(hidden_states) 37 | .view(bsz, q_len, self.num_heads, self.head_dim) 38 | .transpose(1, 2) 39 | ) 40 | key_states = ( 41 | self.k_proj(hidden_states) 42 | .view(bsz, q_len, self.num_heads, self.head_dim) 43 | .transpose(1, 2) 44 | ) 45 | value_states = ( 46 | self.v_proj(hidden_states) 47 | .view(bsz, q_len, self.num_heads, self.head_dim) 48 | .transpose(1, 2) 49 | ) 50 | 51 | kv_seq_len = key_states.shape[-2] 52 | if past_key_value is not None: 53 | kv_seq_len += past_key_value[0].shape[-2] 54 | cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) 55 | ( 56 | query_states, 57 | key_states, 58 | ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb( 59 | query_states, key_states, cos, sin, position_ids 60 | ) 61 | # [bsz, nh, t, hd] 62 | 63 | if past_key_value is not None: 64 | # reuse k, v, self_attention 65 | key_states = torch.cat([past_key_value[0], key_states], dim=2) 66 | value_states = torch.cat([past_key_value[1], value_states], dim=2) 67 | 68 | past_key_value = (key_states, value_states) if use_cache else None 69 | 70 | # We only apply xformers optimizations if we don't need to output the whole attention matrix 71 | if not output_attentions: 72 | query_states = query_states.transpose(1, 2) 73 | key_states = key_states.transpose(1, 2) 74 | value_states = value_states.transpose(1, 2) 75 | 76 | # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros. 77 | # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros. 
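# An all-zero (or absent) mask needs no bias; otherwise LowerTriangularMask() reproduces the causal mask without materializing the full attention matrix.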
78 | if attention_mask is None or attention_mask[0, 0, 0, 1] == 0: 79 | # input and output should be of form (bsz, q_len, num_heads, head_dim) 80 | attn_output = xformers.ops.memory_efficient_attention( 81 | query_states, key_states, value_states, attn_bias=None 82 | ) 83 | else: 84 | # input and output should be of form (bsz, q_len, num_heads, head_dim) 85 | attn_output = xformers.ops.memory_efficient_attention( 86 | query_states, 87 | key_states, 88 | value_states, 89 | attn_bias=xformers.ops.LowerTriangularMask(), 90 | ) 91 | attn_weights = None 92 | else: 93 | attn_weights = torch.matmul( 94 | query_states, key_states.transpose(2, 3) 95 | ) / math.sqrt(self.head_dim) 96 | 97 | if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): 98 | raise ValueError( 99 | f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" 100 | f" {attn_weights.size()}" 101 | ) 102 | 103 | if attention_mask is not None: 104 | if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): 105 | raise ValueError( 106 | f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" 107 | ) 108 | attn_weights = attn_weights + attention_mask 109 | attn_weights = torch.max( 110 | attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min) 111 | ) 112 | 113 | # upcast attention to fp32 114 | attn_weights = nn.functional.softmax( 115 | attn_weights, dim=-1, dtype=torch.float32 116 | ).to(query_states.dtype) 117 | attn_output = torch.matmul(attn_weights, value_states) 118 | 119 | if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): 120 | raise ValueError( 121 | f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" 122 | f" {attn_output.size()}" 123 | ) 124 | 125 | attn_output = attn_output.transpose(1, 2) 126 | 127 | attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) 128 | attn_output = self.o_proj(attn_output) 129 | return attn_output, attn_weights, past_key_value 130 | -------------------------------------------------------------------------------- /mgm/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from mgm.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") -------------------------------------------------------------------------------- /mgm/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from mgm.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from mgm.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /mgm/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import logging.handlers 4 | import os 5 | import sys 6 | 7 | import requests 8 | 9 | from mgm.constants import LOGDIR 10 | 11 | server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" 12 | moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
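# The two messages above are canned responses surfaced to users when generation fails or an input is flagged by moderation.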
13 | 14 | handler = None 15 | 16 | 17 | def build_logger(logger_name, logger_filename): 18 | global handler 19 | 20 | formatter = logging.Formatter( 21 | fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", 22 | datefmt="%Y-%m-%d %H:%M:%S", 23 | ) 24 | 25 | # Set the format of root handlers 26 | if not logging.getLogger().handlers: 27 | logging.basicConfig(level=logging.INFO) 28 | logging.getLogger().handlers[0].setFormatter(formatter) 29 | 30 | # Redirect stdout and stderr to loggers 31 | stdout_logger = logging.getLogger("stdout") 32 | stdout_logger.setLevel(logging.INFO) 33 | sl = StreamToLogger(stdout_logger, logging.INFO) 34 | sys.stdout = sl 35 | 36 | stderr_logger = logging.getLogger("stderr") 37 | stderr_logger.setLevel(logging.ERROR) 38 | sl = StreamToLogger(stderr_logger, logging.ERROR) 39 | sys.stderr = sl 40 | 41 | # Get logger 42 | logger = logging.getLogger(logger_name) 43 | logger.setLevel(logging.INFO) 44 | 45 | # Add a file handler for all loggers 46 | if handler is None: 47 | os.makedirs(LOGDIR, exist_ok=True) 48 | filename = os.path.join(LOGDIR, logger_filename) 49 | handler = logging.handlers.TimedRotatingFileHandler( 50 | filename, when='D', utc=True, encoding='UTF-8') 51 | handler.setFormatter(formatter) 52 | 53 | for name, item in logging.root.manager.loggerDict.items(): 54 | if isinstance(item, logging.Logger): 55 | item.addHandler(handler) 56 | 57 | return logger 58 | 59 | 60 | class StreamToLogger(object): 61 | """ 62 | Fake file-like stream object that redirects writes to a logger instance. 63 | """ 64 | def __init__(self, logger, log_level=logging.INFO): 65 | self.terminal = sys.stdout 66 | self.logger = logger 67 | self.log_level = log_level 68 | self.linebuf = '' 69 | 70 | def __getattr__(self, attr): 71 | return getattr(self.terminal, attr) 72 | 73 | def write(self, buf): 74 | temp_linebuf = self.linebuf + buf 75 | self.linebuf = '' 76 | for line in temp_linebuf.splitlines(True): 77 | # From the io.TextIOWrapper docs: 78 | # On output, if newline is None, any '\n' characters written 79 | # are translated to the system default line separator. 80 | # By default sys.stdout.write() expects '\n' newlines and then 81 | # translates them so this is still cross platform. 82 | if line[-1] == '\n': 83 | self.logger.log(self.log_level, line.rstrip()) 84 | else: 85 | self.linebuf += line 86 | 87 | def flush(self): 88 | if self.linebuf != '': 89 | self.logger.log(self.log_level, self.linebuf.rstrip()) 90 | self.linebuf = '' 91 | 92 | 93 | def disable_torch_init(): 94 | """ 95 | Disable the redundant torch default initialization to accelerate model creation. 96 | """ 97 | import torch 98 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 99 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 100 | 101 | 102 | def violates_moderation(text): 103 | """ 104 | Check whether the text violates OpenAI moderation API. 
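Requires the OPENAI_API_KEY environment variable; returns False when the request fails or the response contains no moderation result.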
105 | """ 106 | url = "https://api.openai.com/v1/moderations" 107 | headers = {"Content-Type": "application/json", 108 | "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} 109 | text = text.replace("\n", "") 110 | data = "{" + '"input": ' + f'"{text}"' + "}" 111 | data = data.encode("utf-8") 112 | try: 113 | ret = requests.post(url, headers=headers, data=data, timeout=5) 114 | flagged = ret.json()["results"][0]["flagged"] 115 | except requests.exceptions.RequestException as e: 116 | flagged = False 117 | except KeyError as e: 118 | flagged = False 119 | 120 | return flagged 121 | 122 | 123 | def pretty_print_semaphore(semaphore): 124 | if semaphore is None: 125 | return "None" 126 | return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" 127 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "mgm" 7 | version = "1.0.0" 8 | description = "Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.0.1", "torchvision==0.15.2", 17 | "transformers==4.36.2", "tokenizers==0.15.0", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft==0.4.0", "bitsandbytes==0.41.0", 19 | "pydantic<2,>=1", "markdown2[all]", "numpy", "scikit-learn==1.2.2", 20 | "gradio==3.35.2", "gradio_client==0.2.9", 21 | "requests", "httpx==0.24.0", "uvicorn", "fastapi", 22 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.9.16", 23 | ] 24 | 25 | [project.optional-dependencies] 26 | train = ["deepspeed==0.11.1", "ninja", "wandb"] 27 | build = ["build", "twine"] 28 | 29 | [project.urls] 30 | "Homepage" = "https://github.com/dvlab-research/MGM" 31 | "Bug Tracker" = "https://github.com/dvlab-research/MGM/issues" 32 | 33 | [tool.setuptools.packages.find] 34 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*", "data*", "model_zoo*", "work_dirs*", "project*"] 35 | 36 | [tool.wheel] 37 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*", "data*", "model_zoo*", "work_dirs*", "project*"] 38 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | 
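# All four arguments are required: the MMBench annotation TSV, the directory of model answers, the upload directory, and the experiment (checkpoint) name.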
parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str, required=True) 9 | parser.add_argument("--result-dir", type=str, required=True) 10 | parser.add_argument("--upload-dir", type=str, required=True) 11 | parser.add_argument("--experiment", type=str, required=True) 12 | 13 | return parser.parse_args() 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | 18 | df = pd.read_table(args.annotation_file) 19 | 20 | cur_df = df.copy() 21 | cur_df = cur_df.drop(columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 22 | cur_df.insert(6, 'prediction', None) 23 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 24 | pred = json.loads(pred) 25 | cur_df.loc[df['index'] == pred['question_id'], 'prediction'] = pred['text'] 26 | 27 | cur_df.to_excel(os.path.join(args.upload_dir, f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 28 | -------------------------------------------------------------------------------- /scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--annotation-file", type=str) 9 | parser.add_argument("--result-file", type=str) 10 | parser.add_argument("--result-upload-file", type=str) 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: continue 24 | data_type = question_data['question_type_id'] 25 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 26 | try: 27 | question_id = int(question_data['question_id']) 28 | except: 29 | question_id = question_data['question_id'] 30 | if question_id not in results: 31 | correct_counts[data_type] = correct_counts.get(data_type, 0) 32 | continue 33 | row = results[question_id] 34 | if row['text'] == question_data['answer']: 35 | correct_counts[data_type] = correct_counts.get(data_type, 0) + 1 36 | 37 | total_count = 0 38 | total_correct = 0 39 | for data_type in sorted(type_counts.keys()): 40 | accuracy = correct_counts[data_type] / type_counts[data_type] * 100 41 | if eval_only_type is None: 42 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 43 | 44 | total_count += type_counts[data_type] 45 | total_correct += correct_counts[data_type] 46 | 47 | total_accuracy = total_correct / total_count * 100 48 | if eval_only_type is None: 49 | print(f"Total accuracy: {total_accuracy:.2f}%") 50 | else: 51 | print(f"{eval_only_type} 
accuracy: {total_accuracy:.2f}%") 52 | 53 | return results 54 | 55 | if __name__ == "__main__": 56 | args = get_args() 57 | data = json.load(open(args.annotation_file)) 58 | ques_type_id_to_name = {id:n for n,id in data['question_type'].items()} 59 | 60 | results = eval_single(args.result_file) 61 | eval_single(args.result_file, eval_only_type='image') 62 | eval_single(args.result_file, eval_only_type='video') 63 | 64 | with open(args.result_upload_file, 'w') as fp: 65 | for question in data['questions']: 66 | qid = question['question_id'] 67 | if qid in results: 68 | result = results[qid] 69 | else: 70 | result = results[int(qid)] 71 | fp.write(json.dumps({ 72 | 'question_id': qid, 73 | 'prediction': result['text'] 74 | }) + '\n') 75 | -------------------------------------------------------------------------------- /scripts/extract_mm_projector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just a utility that I use to extract the projector for quantized models. 3 | It is NOT necessary at all to train, or run inference/serve demos. 4 | Use this script ONLY if you fully understand its implications. 5 | """ 6 | 7 | 8 | import os 9 | import argparse 10 | import torch 11 | import json 12 | from collections import defaultdict 13 | 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='Extract MMProjector weights') 17 | parser.add_argument('--model-path', type=str, help='model folder') 18 | parser.add_argument('--output', type=str, help='output file') 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parse_args() 25 | 26 | keys_to_match = ['mm_projector'] 27 | ckpt_to_key = defaultdict(list) 28 | try: 29 | model_indices = json.load(open(os.path.join(args.model_path, 'pytorch_model.bin.index.json'))) 30 | for k, v in model_indices['weight_map'].items(): 31 | if any(key_match in k for key_match in keys_to_match): 32 | ckpt_to_key[v].append(k) 33 | except FileNotFoundError: 34 | # Smaller models or model checkpoints saved by DeepSpeed. 
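# Without an index file, assume a single consolidated pytorch_model.bin and scan its keys directly for the projector weights.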
35 | v = 'pytorch_model.bin' 36 | for k in torch.load(os.path.join(args.model_path, v), map_location='cpu').keys(): 37 | if any(key_match in k for key_match in keys_to_match): 38 | ckpt_to_key[v].append(k) 39 | 40 | loaded_weights = {} 41 | 42 | for ckpt_name, weight_keys in ckpt_to_key.items(): 43 | ckpt = torch.load(os.path.join(args.model_path, ckpt_name), map_location='cpu') 44 | for k in weight_keys: 45 | loaded_weights[k] = ckpt[k] 46 | 47 | torch.save(loaded_weights, args.output) 48 | -------------------------------------------------------------------------------- /scripts/gemma/eval/math_vista.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-2B" 9 | OPENAIKEY="" 10 | OPENAIBASE="" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_math_vista \ 14 | --model-path work_dirs/$CKPT \ 15 | --question-file data/MGM-Eval/MathVista/testmini.json \ 16 | --image-folder data/MGM-Eval/MathVista \ 17 | --answers-file data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode gemma & 22 | done 23 | 24 | wait 25 | 26 | output_file=./data/MGM-Eval/MathVista/answers/$CKPT/merge.jsonl 27 | score_file=./data/MGM-Eval/MathVista/answers/$CKPT/score.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MathVista/extract_answer.py \ 38 | --output_file $output_file \ 39 | --api_key $OPENAIKEY \ 40 | --api_base $OPENAIBASE 41 | 42 | python mgm/eval/MathVista/calculate_score.py \ 43 | --output_file $output_file \ 44 | --score_file $score_file \ 45 | --gt_file data/MGM-Eval/MathVista/testmini.json 46 | -------------------------------------------------------------------------------- /scripts/gemma/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-2B" 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_mmbench \ 7 | --model-path ./work_dirs/$CKPT \ 8 | --question-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./data/MGM-Eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode gemma 13 | 14 | mkdir -p ./data/MGM-Eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./data/MGM-Eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./data/MGM-Eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment $CKPT 21 | -------------------------------------------------------------------------------- /scripts/gemma/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT="MGM/MGM-2B" 3 | 4 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_loader \ 5 | --model-path work_dirs/$CKPT \ 6 | --question-file data/MGM-Eval/MME/llava_mme.jsonl \ 7 | --image-folder data/MGM-Eval/MME/MME_Benchmark_release_version \ 8 | 
--answers-file data/MGM-Eval/MME/answers/$CKPT.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode gemma 11 | 12 | cd data/MGM-Eval/MME 13 | 14 | python convert_answer_to_mme.py --experiment $CKPT 15 | 16 | cd eval_tool 17 | 18 | python calculation.py --results_dir answers/$CKPT 19 | -------------------------------------------------------------------------------- /scripts/gemma/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-2B" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "validation" \ 21 | --conv-mode gemma & 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python mgm/eval/MMMU/eval/eval.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/val.json -------------------------------------------------------------------------------- /scripts/gemma/eval/mmmu_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-2B" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "test" \ 21 | --conv-mode gemma & 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
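# Each GPU wrote its own ${CHUNKS}_${IDX}.jsonl chunk; the merged file feeds convert_to_test.py below.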
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | 37 | python mgm/eval/MMMU/eval/convert_to_test.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/test.json -------------------------------------------------------------------------------- /scripts/gemma/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-2B" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa \ 12 | --model-path work_dirs/$CKPT \ 13 | --question-file data/MGM-Eval/mm-vet/llava-mm-vet.jsonl \ 14 | --image-folder data/MGM-Eval/mm-vet/images \ 15 | --answers-file data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode gemma & 20 | done 21 | 22 | wait 23 | 24 | output_file=data/MGM-Eval/mm-vet/answers/$CKPT/merge.jsonl 25 | # Clear out the output file if it exists. 26 | > "$output_file" 27 | 28 | # Loop through the indices and concatenate each file. 29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | mkdir -p data/MGM-Eval/mm-vet/results/$CKPT 34 | 35 | python scripts/convert_mmvet_for_eval.py \ 36 | --src $output_file \ 37 | --dst data/MGM-Eval/mm-vet/results/$CKPT/$CKPT.json 38 | 39 | -------------------------------------------------------------------------------- /scripts/gemma/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-2B" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa_loader \ 13 | --model-path ./work_dirs/$CKPT \ 14 | --question-file ./data/MGM-Eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 15 | --image-folder ./data/MGM-Eval/textvqa/train_images \ 16 | --answers-file ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode gemma & 21 | done 22 | 23 | wait 24 | 25 | output_file=./work_dirs/textvqa/answers/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 
31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python -m mgm.eval.eval_textvqa \ 36 | --annotation-file ./data/MGM-Eval/textvqa/TextVQA_0.5.1_val.json \ 37 | --result-file $output_file 38 | -------------------------------------------------------------------------------- /scripts/gemma/train/stage_1_2_full_gemma_v2b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-2B-Pretrain 3 | FINETUNE_NAME=MGM-2B 4 | AUX_SIZE=768 5 | 6 | deepspeed mgm/train/train_mem.py \ 7 | --deepspeed ./scripts/zero2_offload.json \ 8 | --model_name_or_path model_zoo/LLM/gemma/gemma-2b-it \ 9 | --version gemma \ 10 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 11 | --image_folder ./data/MGM-Pretrain \ 12 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 13 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 14 | --mm_projector_type mlp2x_gelu \ 15 | --tune_mm_mlp_adapter True \ 16 | --mm_vision_select_layer -2 \ 17 | --mm_use_im_start_end False \ 18 | --mm_use_im_patch_token False \ 19 | --image_size_aux $AUX_SIZE \ 20 | --bf16 True \ 21 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 22 | --num_train_epochs 1 \ 23 | --per_device_train_batch_size 8 \ 24 | --per_device_eval_batch_size 4 \ 25 | --gradient_accumulation_steps 4 \ 26 | --evaluation_strategy "no" \ 27 | --save_strategy "steps" \ 28 | --save_steps 24000 \ 29 | --save_total_limit 1 \ 30 | --learning_rate 1e-3 \ 31 | --weight_decay 0. \ 32 | --warmup_ratio 0.03 \ 33 | --lr_scheduler_type "cosine" \ 34 | --logging_steps 1 \ 35 | --tf32 True \ 36 | --model_max_length 2048 \ 37 | --gradient_checkpointing True \ 38 | --dataloader_num_workers 4 \ 39 | --lazy_preprocess True \ 40 | --report_to wandb 41 | 42 | 43 | deepspeed mgm/train/train_mem.py \ 44 | --deepspeed ./scripts/zero2.json \ 45 | --model_name_or_path model_zoo/LLM/gemma/gemma-2b-it \ 46 | --version gemma \ 47 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 48 | --image_folder ./data/MGM-Finetune \ 49 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 50 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 51 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 52 | --mm_projector_type mlp2x_gelu \ 53 | --mm_vision_select_layer -2 \ 54 | --mm_use_im_start_end False \ 55 | --mm_use_im_patch_token False \ 56 | --image_aspect_ratio pad \ 57 | --group_by_modality_length True \ 58 | --image_size_aux $AUX_SIZE \ 59 | --bf16 True \ 60 | --output_dir ./work_dirs/$FINETUNE_NAME \ 61 | --num_train_epochs 1 \ 62 | --per_device_train_batch_size 8 \ 63 | --per_device_eval_batch_size 4 \ 64 | --gradient_accumulation_steps 2 \ 65 | --evaluation_strategy "no" \ 66 | --save_strategy "steps" \ 67 | --save_steps 1000 \ 68 | --save_total_limit 1 \ 69 | --learning_rate 2e-5 \ 70 | --weight_decay 0. 
\ 71 | --warmup_ratio 0.03 \ 72 | --lr_scheduler_type "cosine" \ 73 | --logging_steps 1 \ 74 | --tf32 True \ 75 | --model_max_length 2048 \ 76 | --gradient_checkpointing True \ 77 | --dataloader_num_workers 4 \ 78 | --lazy_preprocess True \ 79 | --report_to wandb 80 | -------------------------------------------------------------------------------- /scripts/gemma/train/stage_2_full_gemma_v2b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-2B-Pretrain 3 | FINETUNE_NAME=MGM-2B-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | 8 | deepspeed --hostfile hostfile_4 \ 9 | mgm/train/train_mem.py \ 10 | --deepspeed ./scripts/zero2.json \ 11 | --model_name_or_path model_zoo/LLM/gemma/gemma-2b-it \ 12 | --version gemma \ 13 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 14 | --image_folder ./data/MGM-Finetune \ 15 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 16 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 17 | --image_grid $IMAGE_GRID \ 18 | --image_global $IMAGE_GLOBAL \ 19 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 20 | --mm_projector_type mlp2x_gelu \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --image_aspect_ratio pad \ 25 | --group_by_modality_length True \ 26 | --image_size_aux $AUX_SIZE \ 27 | --bf16 True \ 28 | --output_dir ./work_dirs/$FINETUNE_NAME \ 29 | --num_train_epochs 1 \ 30 | --per_device_train_batch_size 4 \ 31 | --per_device_eval_batch_size 4 \ 32 | --gradient_accumulation_steps 1 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 20000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 2e-5 \ 38 | --weight_decay 0. \ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 True \ 43 | --model_max_length 4096 \ 44 | --gradient_checkpointing True \ 45 | --dataloader_num_workers 4 \ 46 | --lazy_preprocess True \ 47 | --report_to wandb 48 | -------------------------------------------------------------------------------- /scripts/llama/eval/math_vista.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-7B-HD" 9 | OPENAIKEY="" 10 | OPENAIBASE="" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_math_vista \ 14 | --model-path work_dirs/$CKPT \ 15 | --question-file data/MGM-Eval/MathVista/testmini.json \ 16 | --image-folder data/MGM-Eval/MathVista \ 17 | --answers-file data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./data/MGM-Eval/MathVista/answers/$CKPT/merge.jsonl 27 | score_file=./data/MGM-Eval/MathVista/answers/$CKPT/score.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MathVista/extract_answer.py \ 38 | --output_file $output_file \ 39 | --api_key $OPENAIKEY \ 40 | --api_base $OPENAIBASE 41 | 42 | python mgm/eval/MathVista/calculate_score.py \ 43 | --output_file $output_file \ 44 | --score_file $score_file \ 45 | --gt_file data/MGM-Eval/MathVista/testmini.json 46 | -------------------------------------------------------------------------------- /scripts/llama/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-7B-HD" 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | CUDA_VISIBLE_DEVICES=1 python -m mgm.eval.model_vqa_mmbench \ 7 | --model-path ./work_dirs/$CKPT \ 8 | --question-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./data/MGM-Eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode vicuna_v1 13 | 14 | mkdir -p ./data/MGM-Eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./data/MGM-Eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./data/MGM-Eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment $CKPT 21 | -------------------------------------------------------------------------------- /scripts/llama/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-7B-HD" 4 | 5 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_loader \ 6 | --model-path work_dirs/$CKPT \ 7 | --question-file data/MGM-Eval/MME/llava_mme.jsonl \ 8 | --image-folder data/MGM-Eval/MME/MME_Benchmark_release_version \ 9 | --answers-file data/MGM-Eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | cd data/MGM-Eval/MME 14 | 15 | python convert_answer_to_mme.py --experiment $CKPT 16 | 17 | cd eval_tool 18 | 19 | python calculation.py --results_dir answers/$CKPT 20 | -------------------------------------------------------------------------------- /scripts/llama/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-7B-HD" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "validation" \ 21 | --conv-mode vicuna_v1 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python mgm/eval/MMMU/eval/eval.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/val.json -------------------------------------------------------------------------------- /scripts/llama/eval/mmmu_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-7B-HD" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "test" \ 21 | --conv-mode vicuna_v1 & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | 37 | python mgm/eval/MMMU/eval/convert_to_test.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/test.json -------------------------------------------------------------------------------- /scripts/llama/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-7B-HD" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa \ 12 | --model-path work_dirs/$CKPT \ 13 | --question-file data/MGM-Eval/mm-vet/llava-mm-vet.jsonl \ 14 | --image-folder data/MGM-Eval/mm-vet/images \ 15 | --answers-file data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode vicuna_v1 & 20 | done 21 | 22 | wait 23 | 24 | output_file=data/MGM-Eval/mm-vet/answers/$CKPT/merge.jsonl 25 | # Clear out the output file if it exists. 26 | > "$output_file" 27 | 28 | # Loop through the indices and concatenate each file. 
29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | mkdir -p data/MGM-Eval/mm-vet/results/$CKPT 34 | 35 | python scripts/convert_mmvet_for_eval.py \ 36 | --src $output_file \ 37 | --dst data/MGM-Eval/mm-vet/results/$CKPT/$CKPT.json 38 | 39 | -------------------------------------------------------------------------------- /scripts/llama/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-7B-HD" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa_loader \ 13 | --model-path ./work_dirs/$CKPT \ 14 | --question-file ./data/MGM-Eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 15 | --image-folder ./data/MGM-Eval/textvqa/train_images \ 16 | --answers-file ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./work_dirs/textvqa/answers/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python -m mgm.eval.eval_textvqa \ 36 | --annotation-file ./data/MGM-Eval/textvqa/TextVQA_0.5.1_val.json \ 37 | --result-file $output_file 38 | -------------------------------------------------------------------------------- /scripts/llama/train/stage_1_2_full_v13b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-13B-Pretrain 3 | FINETUNE_NAME=MGM-13B 4 | AUX_SIZE=768 5 | 6 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 7 | 8 | deepspeed --hostfile hostfile_4 \ 9 | mgm/train/train_mem.py \ 10 | --deepspeed ./scripts/zero2.json \ 11 | --model_name_or_path model_zoo/LLM/vicuna/13B-V1.5 \ 12 | --version plain \ 13 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 14 | --image_folder ./data/MGM-Pretrain \ 15 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 16 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 17 | --mm_projector_type mlp2x_gelu \ 18 | --tune_mm_mlp_adapter True \ 19 | --mm_vision_select_layer -2 \ 20 | --mm_use_im_start_end False \ 21 | --mm_use_im_patch_token False \ 22 | --image_size_aux $AUX_SIZE \ 23 | --bf16 True \ 24 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 8 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 1 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | 45 | 46 | deepspeed --hostfile hostfile_4 \ 47 | mgm/train/train_mem.py \ 48 | --deepspeed ./scripts/zero2.json \ 49 | --model_name_or_path model_zoo/LLM/vicuna/13B-V1.5 \ 50 | --version v1 \ 51 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 52 | --image_folder ./data/MGM-Finetune \ 53 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 54 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 55 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 56 | --mm_projector_type mlp2x_gelu \ 57 | --mm_vision_select_layer -2 \ 58 | --mm_use_im_start_end False \ 59 | --mm_use_im_patch_token False \ 60 | --image_aspect_ratio pad \ 61 | --group_by_modality_length True \ 62 | --image_size_aux $AUX_SIZE \ 63 | --bf16 True \ 64 | --output_dir ./work_dirs/$FINETUNE_NAME \ 65 | --num_train_epochs 1 \ 66 | --per_device_train_batch_size 4 \ 67 | --per_device_eval_batch_size 4 \ 68 | --gradient_accumulation_steps 1 \ 69 | --evaluation_strategy "no" \ 70 | --save_strategy "steps" \ 71 | --save_steps 1000 \ 72 | --save_total_limit 1 \ 73 | --learning_rate 2e-5 \ 74 | --weight_decay 0. \ 75 | --warmup_ratio 0.03 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --tf32 True \ 79 | --model_max_length 2048 \ 80 | --gradient_checkpointing True \ 81 | --dataloader_num_workers 4 \ 82 | --lazy_preprocess True \ 83 | --report_to wandb 84 | -------------------------------------------------------------------------------- /scripts/llama/train/stage_1_2_full_v7b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-7B-Pretrain 3 | FINETUNE_NAME=MGM-7B 4 | AUX_SIZE=768 5 | 6 | # delete --hostfile hostfile and change --per_device_train_batch_size if trained on single machine 7 | 8 | deepspeed --hostfile hostfile \ 9 | mgm/train/train_mem.py \ 10 | --deepspeed ./scripts/zero2_offload.json \ 11 | --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \ 12 | --version plain \ 13 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 14 | --image_folder ./data/MGM-Pretrain \ 15 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 16 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 17 | --mm_projector_type mlp2x_gelu \ 18 | --tune_mm_mlp_adapter True \ 19 | --mm_vision_select_layer -2 \ 20 | --mm_use_im_start_end False \ 21 | --mm_use_im_patch_token False \ 22 | --image_size_aux $AUX_SIZE \ 23 | --bf16 True \ 24 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 16 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 1 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | 45 | 46 | deepspeed --hostfile hostfile \ 47 | mgm/train/train_mem.py \ 48 | --deepspeed ./scripts/zero2_offload.json \ 49 | --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \ 50 | --version v1 \ 51 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 52 | --image_folder ./data/MGM-Finetune \ 53 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 54 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 55 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 56 | --mm_projector_type mlp2x_gelu \ 57 | --mm_vision_select_layer -2 \ 58 | --mm_use_im_start_end False \ 59 | --mm_use_im_patch_token False \ 60 | --image_aspect_ratio pad \ 61 | --group_by_modality_length True \ 62 | --image_size_aux $AUX_SIZE \ 63 | --bf16 True \ 64 | --output_dir ./work_dirs/$FINETUNE_NAME \ 65 | --num_train_epochs 1 \ 66 | --per_device_train_batch_size 8 \ 67 | --per_device_eval_batch_size 4 \ 68 | --gradient_accumulation_steps 1 \ 69 | --evaluation_strategy "no" \ 70 | --save_strategy "steps" \ 71 | --save_steps 1000 \ 72 | --save_total_limit 1 \ 73 | --learning_rate 2e-5 \ 74 | --weight_decay 0. \ 75 | --warmup_ratio 0.03 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --tf32 True \ 79 | --model_max_length 2048 \ 80 | --gradient_checkpointing True \ 81 | --dataloader_num_workers 4 \ 82 | --lazy_preprocess True \ 83 | --report_to wandb 84 | -------------------------------------------------------------------------------- /scripts/llama/train/stage_1_2_full_v7b_336_hr_768_nodp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-7B-Pretrain 3 | FINETUNE_NAME=MGM-7B 4 | AUX_SIZE=768 5 | DROP_PATH=False 6 | # delete --hostfile hostfile and change --per_device_train_batch_size if trained on single machine 7 | 8 | deepspeed --hostfile hostfile \ 9 | mgm/train/train_mem.py \ 10 | --deepspeed ./scripts/zero2_offload.json \ 11 | --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \ 12 | --version plain \ 13 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 14 | --image_folder ./data/MGM-Pretrain \ 15 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 16 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 17 | --mm_projector_type mlp2x_gelu \ 18 | --tune_mm_mlp_adapter True \ 19 | --mm_vision_select_layer -2 \ 20 | --mm_use_im_start_end False \ 21 | --mm_use_im_patch_token False \ 22 | --image_size_aux $AUX_SIZE \ 23 | --drop_path $DROP_PATH \ 24 | --bf16 True \ 25 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 16 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | 46 | 47 | deepspeed --hostfile hostfile \ 48 | mgm/train/train_mem.py \ 49 | --deepspeed ./scripts/zero2_offload.json \ 50 | --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \ 51 | --version v1 \ 52 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 53 | --image_folder ./data/MGM-Finetune \ 54 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 55 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 56 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 57 | --mm_projector_type mlp2x_gelu \ 58 | --mm_vision_select_layer -2 \ 59 | --mm_use_im_start_end False \ 60 | --mm_use_im_patch_token False \ 61 | --image_aspect_ratio pad \ 62 | --group_by_modality_length True \ 63 | --image_size_aux $AUX_SIZE \ 64 | --drop_path $DROP_PATH \ 65 | --bf16 True \ 66 | --output_dir ./work_dirs/$FINETUNE_NAME \ 67 | --num_train_epochs 1 \ 68 | --per_device_train_batch_size 8 \ 69 | --per_device_eval_batch_size 4 \ 70 | --gradient_accumulation_steps 1 \ 71 | --evaluation_strategy "no" \ 72 | --save_strategy "steps" \ 73 | --save_steps 1000 \ 74 | --save_total_limit 1 \ 75 | --learning_rate 2e-5 \ 76 | --weight_decay 0. \ 77 | --warmup_ratio 0.03 \ 78 | --lr_scheduler_type "cosine" \ 79 | --logging_steps 1 \ 80 | --tf32 True \ 81 | --model_max_length 2048 \ 82 | --gradient_checkpointing True \ 83 | --dataloader_num_workers 4 \ 84 | --lazy_preprocess True \ 85 | --report_to wandb 86 | -------------------------------------------------------------------------------- /scripts/llama/train/stage_2_full_v13b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-13B-Pretrain 3 | FINETUNE_NAME=MGM-13B-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | 8 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 9 | 10 | deepspeed --hostfile hostfile_4 \ 11 | mgm/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path model_zoo/LLM/vicuna/13B-V1.5 \ 14 | --version v1 \ 15 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 16 | --image_folder ./data/MGM-Finetune \ 17 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 18 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 19 | --image_grid $IMAGE_GRID \ 20 | --image_global $IMAGE_GLOBAL \ 21 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --image_size_aux $AUX_SIZE \ 29 | --bf16 True \ 30 | --output_dir ./work_dirs/$FINETUNE_NAME \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 4 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 1 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 1000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. 
\ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 4096 \ 46 | --gradient_checkpointing True \ 47 | --dataloader_num_workers 4 \ 48 | --lazy_preprocess True \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/llama/train/stage_2_full_v7b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-7B-Pretrain 3 | FINETUNE_NAME=MGM-7B-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | 8 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 9 | 10 | deepspeed --hostfile hostfile_4 \ 11 | mgm/train/train_mem.py \ 12 | --deepspeed ./scripts/zero2.json \ 13 | --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \ 14 | --version v1 \ 15 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 16 | --image_folder ./data/MGM-Finetune \ 17 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 18 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 19 | --image_grid $IMAGE_GRID \ 20 | --image_global $IMAGE_GLOBAL \ 21 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --image_size_aux $AUX_SIZE \ 29 | --bf16 True \ 30 | --output_dir ./work_dirs/$FINETUNE_NAME \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 4 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 1 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 1000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. \ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 4096 \ 46 | --gradient_checkpointing True \ 47 | --dataloader_num_workers 4 \ 48 | --lazy_preprocess True \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/llama3/eval/math_vista.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM-8B-LLaMA-3-HD" 9 | OPENAIKEY="" 10 | OPENAIBASE="" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_math_vista \ 14 | --model-path work_dirs/$CKPT \ 15 | --question-file data/MGM-Eval/MathVista/testmini.json \ 16 | --image-folder data/MGM-Eval/MathVista \ 17 | --answers-file data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode llama_3 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./data/MGM-Eval/MathVista/answers/$CKPT/merge.jsonl 27 | score_file=./data/MGM-Eval/MathVista/answers/$CKPT/score.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MathVista/extract_answer.py \ 38 | --output_file $output_file \ 39 | --api_key $OPENAIKEY \ 40 | --api_base $OPENAIBASE 41 | 42 | python mgm/eval/MathVista/calculate_score.py \ 43 | --output_file $output_file \ 44 | --score_file $score_file \ 45 | --gt_file data/MGM-Eval/MathVista/testmini.json 46 | -------------------------------------------------------------------------------- /scripts/llama3/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM-8B-LLaMA-3-HD" 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | CUDA_VISIBLE_DEVICES=1 python -m mgm.eval.model_vqa_mmbench \ 7 | --model-path ./work_dirs/$CKPT \ 8 | --question-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./data/MGM-Eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode llama_3 13 | 14 | mkdir -p ./data/MGM-Eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./data/MGM-Eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./data/MGM-Eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment $CKPT 21 | -------------------------------------------------------------------------------- /scripts/llama3/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM-8B-LLaMA-3-HD" 4 | 5 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_loader \ 6 | --model-path work_dirs/$CKPT \ 7 | --question-file data/MGM-Eval/MME/llava_mme.jsonl \ 8 | --image-folder data/MGM-Eval/MME/MME_Benchmark_release_version \ 9 | --answers-file data/MGM-Eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode llama_3 12 | 13 | cd data/MGM-Eval/MME 14 | 15 | python convert_answer_to_mme.py --experiment $CKPT 16 | 17 | cd eval_tool 18 | 19 | python calculation.py --results_dir answers/$CKPT 20 | -------------------------------------------------------------------------------- /scripts/llama3/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM-8B-LLaMA-3-HD" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "validation" \ 21 | --conv-mode llama_3 & 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python mgm/eval/MMMU/eval/eval.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/val.json -------------------------------------------------------------------------------- /scripts/llama3/eval/mmmu_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM-8B-LLaMA-3-HD" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "test" \ 21 | --conv-mode llama_3 & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | 37 | python mgm/eval/MMMU/eval/convert_to_test.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/test.json -------------------------------------------------------------------------------- /scripts/llama3/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM-8B-LLaMA-3-HD" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa \ 12 | --model-path work_dirs/$CKPT \ 13 | --question-file data/MGM-Eval/mm-vet/llava-mm-vet.jsonl \ 14 | --image-folder data/MGM-Eval/mm-vet/images \ 15 | --answers-file data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode llama_3 & 20 | done 21 | 22 | wait 23 | 24 | output_file=data/MGM-Eval/mm-vet/answers/$CKPT/merge.jsonl 25 | # Clear out the output file if it exists. 26 | > "$output_file" 27 | 28 | # Loop through the indices and concatenate each file. 
29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | mkdir -p data/MGM-Eval/mm-vet/results/$CKPT 34 | 35 | python scripts/convert_mmvet_for_eval.py \ 36 | --src $output_file \ 37 | --dst data/MGM-Eval/mm-vet/results/$CKPT/$CKPT.json 38 | 39 | -------------------------------------------------------------------------------- /scripts/llama3/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM-8B-LLaMA-3-HD" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa_loader \ 13 | --model-path ./work_dirs/$CKPT \ 14 | --question-file ./data/MGM-Eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 15 | --image-folder ./data/MGM-Eval/textvqa/train_images \ 16 | --answers-file ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode llama_3 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./work_dirs/textvqa/answers/$CKPT/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python -m mgm.eval.eval_textvqa \ 36 | --annotation-file ./data/MGM-Eval/textvqa/TextVQA_0.5.1_val.json \ 37 | --result-file $output_file 38 | -------------------------------------------------------------------------------- /scripts/llama3/train/stage_1_2_full_v8b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-8B-LLaMA-3-Pretrain 3 | FINETUNE_NAME=MGM-8B-LLaMA-3 4 | AUX_SIZE=768 5 | 6 | # delete --hostfile hostfile and change --per_device_train_batch_size if trained on single machine 7 | 8 | deepspeed --hostfile ../hostfile \ 9 | mgm/train/train_mem.py \ 10 | --deepspeed ./scripts/zero2_offload.json \ 11 | --model_name_or_path model_zoo/LLM/llama-3/Meta-Llama-3-8B-Instruct \ 12 | --version llama_3 \ 13 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 14 | --image_folder ./data/MGM-Pretrain \ 15 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 16 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 17 | --mm_projector_type mlp2x_gelu \ 18 | --tune_mm_mlp_adapter True \ 19 | --mm_vision_select_layer -2 \ 20 | --mm_use_im_start_end False \ 21 | --mm_use_im_patch_token False \ 22 | --image_size_aux $AUX_SIZE \ 23 | --bf16 True \ 24 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 25 | --num_train_epochs 1 \ 26 | --per_device_train_batch_size 16 \ 27 | --per_device_eval_batch_size 4 \ 28 | --gradient_accumulation_steps 1 \ 29 | --evaluation_strategy "no" \ 30 | --save_strategy "steps" \ 31 | --save_steps 24000 \ 32 | --save_total_limit 1 \ 33 | --learning_rate 1e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 4 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | 45 | 46 | deepspeed --hostfile ../hostfile \ 47 | mgm/train/train_mem.py \ 48 | --deepspeed ./scripts/zero2_offload.json \ 49 | --model_name_or_path model_zoo/LLM/llama-3/Meta-Llama-3-8B-Instruct \ 50 | --version llama_3 \ 51 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 52 | --image_folder ./data/MGM-Finetune \ 53 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 54 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 55 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 56 | --mm_projector_type mlp2x_gelu \ 57 | --mm_vision_select_layer -2 \ 58 | --mm_use_im_start_end False \ 59 | --mm_use_im_patch_token False \ 60 | --image_aspect_ratio pad \ 61 | --group_by_modality_length True \ 62 | --image_size_aux $AUX_SIZE \ 63 | --bf16 True \ 64 | --output_dir ./work_dirs/$FINETUNE_NAME \ 65 | --num_train_epochs 1 \ 66 | --per_device_train_batch_size 8 \ 67 | --per_device_eval_batch_size 4 \ 68 | --gradient_accumulation_steps 1 \ 69 | --evaluation_strategy "no" \ 70 | --save_strategy "steps" \ 71 | --save_steps 1000 \ 72 | --save_total_limit 1 \ 73 | --learning_rate 2e-5 \ 74 | --weight_decay 0. \ 75 | --warmup_ratio 0.03 \ 76 | --lr_scheduler_type "cosine" \ 77 | --logging_steps 1 \ 78 | --tf32 True \ 79 | --model_max_length 2048 \ 80 | --gradient_checkpointing True \ 81 | --dataloader_num_workers 4 \ 82 | --lazy_preprocess True \ 83 | --report_to wandb 84 | -------------------------------------------------------------------------------- /scripts/llama3/train/stage_2_full_v8b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-8B-LLaMA-3-Pretrain 3 | FINETUNE_NAME=MGM-8B-LLaMA-3-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | 8 | # delete --hostfile hostfile and change --per_device_train_batch_size if trained on single machine 9 | 10 | deepspeed --hostfile ../hostfile \ 11 | mgm/train/train_mem.py \ 12 | --deepspeed ./scripts/zero3.json \ 13 | --model_name_or_path model_zoo/LLM/llama-3/Meta-Llama-3-8B-Instruct \ 14 | --version llama_3 \ 15 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 16 | --image_folder ./data/MGM-Finetune \ 17 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 18 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 19 | --image_grid $IMAGE_GRID \ 20 | --image_global $IMAGE_GLOBAL \ 21 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 22 | --mm_projector_type mlp2x_gelu \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --image_aspect_ratio pad \ 27 | --group_by_modality_length True \ 28 | --image_size_aux $AUX_SIZE \ 29 | --bf16 True \ 30 | --output_dir ./work_dirs/$FINETUNE_NAME \ 31 | --num_train_epochs 1 \ 32 | --per_device_train_batch_size 4 \ 33 | --per_device_eval_batch_size 4 \ 34 | --gradient_accumulation_steps 2 \ 35 | --evaluation_strategy "no" \ 36 | --save_strategy "steps" \ 37 | --save_steps 1000 \ 38 | --save_total_limit 1 \ 39 | --learning_rate 2e-5 \ 40 | --weight_decay 0. 
\ 41 | --warmup_ratio 0.03 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --tf32 True \ 45 | --model_max_length 4096 \ 46 | --gradient_checkpointing True \ 47 | --dataloader_num_workers 4 \ 48 | --lazy_preprocess True \ 49 | --report_to wandb 50 | -------------------------------------------------------------------------------- /scripts/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mgm.model.builder import load_pretrained_model 3 | from mgm.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map='cpu') 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/mixtral/eval/math_vista.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-8x7B" 9 | OPENAIKEY="" 10 | OPENAIBASE="" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_math_vista \ 14 | --model-path work_dirs/$CKPT \ 15 | --question-file data/MGM-Eval/MathVista/testmini.json \ 16 | --image-folder data/MGM-Eval/MathVista \ 17 | --answers-file data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode mistral_instruct & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./data/MGM-Eval/MathVista/answers/$CKPT/merge.jsonl 27 | score_file=./data/MGM-Eval/MathVista/answers/$CKPT/score.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MathVista/extract_answer.py \ 38 | --output_file $output_file \ 39 | --api_key $OPENAIKEY \ 40 | --api_base $OPENAIBASE 41 | 42 | python mgm/eval/MathVista/calculate_score.py \ 43 | --output_file $output_file \ 44 | --score_file $score_file \ 45 | --gt_file data/MGM-Eval/MathVista/testmini.json 46 | -------------------------------------------------------------------------------- /scripts/mixtral/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-8x7B" 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | python -m mgm.eval.model_vqa_mmbench \ 7 | --model-path ./work_dirs/$CKPT \ 8 | --question-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./data/MGM-Eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode mistral_instruct 13 | 14 | mkdir -p ./data/MGM-Eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./data/MGM-Eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./data/MGM-Eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment $CKPT 21 | -------------------------------------------------------------------------------- /scripts/mixtral/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT="MGM/MGM-8x7B" 3 | 4 | python -m mgm.eval.model_vqa_loader \ 5 | --model-path work_dirs/$CKPT \ 6 | --question-file data/MGM-Eval/MME/llava_mme.jsonl \ 7 | --image-folder data/MGM-Eval/MME/MME_Benchmark_release_version \ 8 | --answers-file data/MGM-Eval/MME/answers/$CKPT.jsonl \ 9 | --temperature 0 \ 10 | --conv-mode mistral_instruct 11 | 12 | 13 | cd data/MGM-Eval/MME 14 | 15 | python convert_answer_to_mme.py --experiment $CKPT 16 | 17 | cd eval_tool 18 | 19 | python calculation.py --results_dir answers/$CKPT 20 | -------------------------------------------------------------------------------- /scripts/mixtral/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-8x7B" 4 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 5 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 6 | 7 | # Clear out the output file if it exists. 
8 | > "$output_file" 9 | 10 | python mgm/eval/MMMU/eval/run_llava.py \ 11 | --data_path ./data/MGM-Eval/MMMU \ 12 | --config_path $CONFIG \ 13 | --model_path ./work_dirs/$CKPT \ 14 | --answers-file $output_file \ 15 | --split "validation" \ 16 | --conv-mode mistral_instruct 17 | 18 | python mgm/eval/MMMU/eval/eval.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/val.json -------------------------------------------------------------------------------- /scripts/mixtral/eval/mmmu_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-8x7B" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "test" \ 21 | --load_8bit True \ 22 | --conv-mode mistral_instruct & 23 | done 24 | 25 | wait 26 | 27 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MMMU/eval/convert_to_test.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/test.json -------------------------------------------------------------------------------- /scripts/mixtral/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-8x7B" 9 | 10 | output_file=data/MGM-Eval/mm-vet/answers/$CKPT/merge.jsonl 11 | # Clear out the output file if it exists. 12 | > "$output_file" 13 | 14 | python -m mgm.eval.model_vqa \ 15 | --model-path work_dirs/$CKPT \ 16 | --question-file data/MGM-Eval/mm-vet/llava-mm-vet.jsonl \ 17 | --image-folder data/MGM-Eval/mm-vet/images \ 18 | --answers-file $output_file \ 19 | --temperature 0 \ 20 | --conv-mode mistral_instruct 21 | 22 | mkdir -p data/MGM-Eval/mm-vet/results/$CKPT 23 | 24 | python scripts/convert_mmvet_for_eval.py \ 25 | --src $output_file \ 26 | --dst data/MGM-Eval/mm-vet/results/$CKPT/$CKPT.json 27 | 28 | -------------------------------------------------------------------------------- /scripts/mixtral/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-8x7B" 4 | output_file=./work_dirs/textvqa/answers/$CKPT/merge.jsonl 5 | # Clear out the output file if it exists. 
6 | > "$output_file" 7 | 8 | python -m mgm.eval.model_vqa_loader \ 9 | --model-path ./work_dirs/$CKPT \ 10 | --question-file ./data/MGM-Eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 11 | --image-folder ./data/MGM-Eval/textvqa/train_images \ 12 | --answers-file $output_file \ 13 | --temperature 0 \ 14 | --conv-mode mistral_instruct 15 | 16 | python -m mgm.eval.eval_textvqa \ 17 | --annotation-file ./data/MGM-Eval/textvqa/TextVQA_0.5.1_val.json \ 18 | --result-file $output_file 19 | -------------------------------------------------------------------------------- /scripts/mixtral/train/stage_1_2_full_mixtral_8x7b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-8x7B-Pretrain 3 | FINETUNE_NAME=MGM-8x7B 4 | AUX_SIZE=768 5 | LR_MULTI="model.mm_projector:2,model.vlm_uni:2" 6 | 7 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 8 | 9 | deepspeed --hostfile hostfile_4 \ 10 | mgm/train/train_mem.py \ 11 | --deepspeed ./scripts/zero3.json \ 12 | --model_name_or_path model_zoo/LLM/mixtral/Mixtral-8x7B-Instruct-v0.1 \ 13 | --version plain \ 14 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 15 | --image_folder ./data/MGM-Pretrain \ 16 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 17 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --image_size_aux $AUX_SIZE \ 24 | --bf16 True \ 25 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-3 \ 35 | --weight_decay 0. \ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | 46 | 47 | deepspeed --hostfile hostfile_4 \ 48 | mgm/train/train_mem.py \ 49 | --deepspeed ./scripts/zero3.json \ 50 | --model_name_or_path model_zoo/LLM/mixtral/Mixtral-8x7B-Instruct-v0.1 \ 51 | --version mistral_instruct \ 52 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 53 | --image_folder ./data/MGM-Finetune \ 54 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 55 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 56 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 57 | --mm_projector_type mlp2x_gelu \ 58 | --mm_vision_select_layer -2 \ 59 | --mm_use_im_start_end False \ 60 | --mm_use_im_patch_token False \ 61 | --image_aspect_ratio pad \ 62 | --group_by_modality_length True \ 63 | --image_size_aux $AUX_SIZE \ 64 | --bf16 True \ 65 | --output_dir ./work_dirs/$FINETUNE_NAME \ 66 | --num_train_epochs 1 \ 67 | --per_device_train_batch_size 4 \ 68 | --per_device_eval_batch_size 4 \ 69 | --gradient_accumulation_steps 1 \ 70 | --evaluation_strategy "no" \ 71 | --save_strategy "steps" \ 72 | --save_steps 1000 \ 73 | --save_total_limit 1 \ 74 | --learning_rate 1e-5 \ 75 | --lr_multi $LR_MULTI \ 76 | --weight_decay 0. 
\ 77 | --warmup_ratio 0.03 \ 78 | --lr_scheduler_type "cosine" \ 79 | --logging_steps 1 \ 80 | --tf32 True \ 81 | --model_max_length 2048 \ 82 | --gradient_checkpointing True \ 83 | --dataloader_num_workers 4 \ 84 | --lazy_preprocess True \ 85 | --report_to wandb 86 | -------------------------------------------------------------------------------- /scripts/mixtral/train/stage_2_full_mixtral_8x7b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-8x7B-Pretrain 3 | FINETUNE_NAME=MGM-8x7B-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | LR_MULTI="model.mm_projector:2,model.vlm_uni:2" 8 | 9 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 10 | 11 | deepspeed --hostfile hostfile_4 \ 12 | mgm/train/train_mem.py \ 13 | --deepspeed ./scripts/zero3.json \ 14 | --model_name_or_path model_zoo/LLM/mixtral/Mixtral-8x7B-Instruct-v0.1 \ 15 | --version mistral_instruct \ 16 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 17 | --image_folder ./data/MGM-Finetune \ 18 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 19 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 20 | --image_grid $IMAGE_GRID \ 21 | --image_global $IMAGE_GLOBAL \ 22 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 23 | --mm_projector_type mlp2x_gelu \ 24 | --mm_vision_select_layer -2 \ 25 | --mm_use_im_start_end False \ 26 | --mm_use_im_patch_token False \ 27 | --image_aspect_ratio pad \ 28 | --group_by_modality_length True \ 29 | --image_size_aux $AUX_SIZE \ 30 | --bf16 True \ 31 | --output_dir ./work_dirs/$FINETUNE_NAME \ 32 | --num_train_epochs 1 \ 33 | --per_device_train_batch_size 4 \ 34 | --per_device_eval_batch_size 4 \ 35 | --gradient_accumulation_steps 1 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 1000 \ 39 | --save_total_limit 1 \ 40 | --learning_rate 1e-5 \ 41 | --lr_multi $LR_MULTI \ 42 | --weight_decay 0. \ 43 | --warmup_ratio 0.03 \ 44 | --lr_scheduler_type "cosine" \ 45 | --logging_steps 1 \ 46 | --tf32 True \ 47 | --model_max_length 4096 \ 48 | --gradient_checkpointing True \ 49 | --dataloader_num_workers 4 \ 50 | --lazy_preprocess True \ 51 | --report_to wandb 52 | -------------------------------------------------------------------------------- /scripts/yi/eval/math_vista.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-34B" 9 | OPENAIKEY="" 10 | OPENAIBASE="" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_math_vista \ 14 | --model-path work_dirs/$CKPT \ 15 | --question-file data/MGM-Eval/MathVista/testmini.json \ 16 | --image-folder data/MGM-Eval/MathVista \ 17 | --answers-file data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --conv-mode chatml_direct & 22 | done 23 | 24 | wait 25 | 26 | output_file=./data/MGM-Eval/MathVista/answers/$CKPT/merge.jsonl 27 | score_file=./data/MGM-Eval/MathVista/answers/$CKPT/score.jsonl 28 | 29 | # Clear out the output file if it exists. 
30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./data/MGM-Eval/MathVista/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python mgm/eval/MathVista/extract_answer.py \ 38 | --output_file $output_file \ 39 | --api_key $OPENAIKEY \ 40 | --api_base $OPENAIBASE 41 | 42 | python mgm/eval/MathVista/calculate_score.py \ 43 | --output_file $output_file \ 44 | --score_file $score_file \ 45 | --gt_file data/MGM-Eval/MathVista/testmini.json 46 | -------------------------------------------------------------------------------- /scripts/yi/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-34B" 4 | SPLIT="mmbench_dev_20230712" 5 | 6 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_mmbench \ 7 | --model-path ./work_dirs/$CKPT \ 8 | --question-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 9 | --answers-file ./data/MGM-Eval/mmbench/answers/$SPLIT/$CKPT.jsonl \ 10 | --single-pred-prompt \ 11 | --temperature 0 \ 12 | --conv-mode chatml_direct 13 | 14 | mkdir -p ./data/MGM-Eval/mmbench/answers_upload/$SPLIT 15 | 16 | python scripts/convert_mmbench_for_submission.py \ 17 | --annotation-file ./data/MGM-Eval/mmbench/$SPLIT.tsv \ 18 | --result-dir ./data/MGM-Eval/mmbench/answers/$SPLIT \ 19 | --upload-dir ./data/MGM-Eval/mmbench/answers_upload/$SPLIT \ 20 | --experiment $CKPT 21 | -------------------------------------------------------------------------------- /scripts/yi/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CKPT="MGM/MGM-34B" 4 | 5 | CUDA_VISIBLE_DEVICES=0 python -m mgm.eval.model_vqa_loader \ 6 | --model-path work_dirs/$CKPT \ 7 | --question-file data/MGM-Eval/MME/llava_mme.jsonl \ 8 | --image-folder data/MGM-Eval/MME/MME_Benchmark_release_version \ 9 | --answers-file data/MGM-Eval/MME/answers/$CKPT.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode chatml_direct 12 | 13 | cd data/MGM-Eval/MME 14 | 15 | python convert_answer_to_mme.py --experiment $CKPT 16 | 17 | cd eval_tool 18 | 19 | python calculation.py --results_dir answers/$CKPT 20 | -------------------------------------------------------------------------------- /scripts/yi/eval/mmmu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-34B" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "validation" \ 21 | --conv-mode chatml_direct & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python mgm/eval/MMMU/eval/eval.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/val.json -------------------------------------------------------------------------------- /scripts/yi/eval/mmmu_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-34B" 10 | CONFIG="mgm/eval/MMMU/eval/configs/llava1.5.yaml" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python mgm/eval/MMMU/eval/run_llava.py \ 14 | --data_path ./data/MGM-Eval/MMMU \ 15 | --config_path $CONFIG \ 16 | --model_path ./work_dirs/$CKPT \ 17 | --answers-file ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --split "test" \ 21 | --conv-mode chatml_direct & #--load_8bit True \ use this if you want to load 8-bit model 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/MMMU/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/MMMU/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python mgm/eval/MMMU/eval/convert_to_test.py --result_file $output_file --output_path ./work_dirs/MMMU/$CKPT/test.json -------------------------------------------------------------------------------- /scripts/yi/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT="MGM/MGM-34B" 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa \ 12 | --model-path work_dirs/$CKPT \ 13 | --question-file data/MGM-Eval/mm-vet/llava-mm-vet.jsonl \ 14 | --image-folder data/MGM-Eval/mm-vet/images \ 15 | --answers-file data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode chatml_direct & 20 | done 21 | 22 | wait 23 | 24 | output_file=data/MGM-Eval/mm-vet/answers/$CKPT/merge.jsonl 25 | # Clear out the output file if it exists. 26 | > "$output_file" 27 | 28 | # Loop through the indices and concatenate each file. 
29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat data/MGM-Eval/mm-vet/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | mkdir -p data/MGM-Eval/mm-vet/results/$CKPT 34 | 35 | python scripts/convert_mmvet_for_eval.py \ 36 | --src $output_file \ 37 | --dst data/MGM-Eval/mm-vet/results/$CKPT/$CKPT.json 38 | 39 | -------------------------------------------------------------------------------- /scripts/yi/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT="MGM/MGM-34B" 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m mgm.eval.model_vqa_loader \ 13 | --model-path ./work_dirs/$CKPT \ 14 | --question-file ./data/MGM-Eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 15 | --image-folder ./data/MGM-Eval/textvqa/train_images \ 16 | --answers-file ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --load_8bit True \ 21 | --conv-mode chatml_direct & 22 | done 23 | 24 | wait 25 | 26 | output_file=./work_dirs/textvqa/answers/$CKPT/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./work_dirs/textvqa/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python -m mgm.eval.eval_textvqa \ 37 | --annotation-file ./data/MGM-Eval/textvqa/TextVQA_0.5.1_val.json \ 38 | --result-file $output_file 39 | -------------------------------------------------------------------------------- /scripts/yi/train/stage_1_2_full_yi34b_336_hr_768.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-34B-Pretrain 3 | FINETUNE_NAME=MGM-34B 4 | AUX_SIZE=768 5 | LR_MULTI="model.mm_projector:2,model.vlm_uni:2" 6 | 7 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 8 | 9 | deepspeed --hostfile hostfile_4 \ 10 | mgm/train/train_mem.py \ 11 | --deepspeed ./scripts/zero3.json \ 12 | --model_name_or_path model_zoo/LLM/Nous-Hermes-2-Yi-34B \ 13 | --version plain \ 14 | --data_path ./data/MGM-Pretrain/mgm_pretrain.json \ 15 | --image_folder ./data/MGM-Pretrain \ 16 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 17 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 18 | --mm_projector_type mlp2x_gelu \ 19 | --tune_mm_mlp_adapter True \ 20 | --mm_vision_select_layer -2 \ 21 | --mm_use_im_start_end False \ 22 | --mm_use_im_patch_token False \ 23 | --image_size_aux $AUX_SIZE \ 24 | --bf16 True \ 25 | --output_dir ./work_dirs/$PRETRAIN_NAME \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 8 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --save_total_limit 1 \ 34 | --learning_rate 1e-3 \ 35 | --weight_decay 0. 
\ 36 | --warmup_ratio 0.03 \ 37 | --lr_scheduler_type "cosine" \ 38 | --logging_steps 1 \ 39 | --tf32 True \ 40 | --model_max_length 2048 \ 41 | --gradient_checkpointing True \ 42 | --dataloader_num_workers 4 \ 43 | --lazy_preprocess True \ 44 | --report_to wandb 45 | 46 | 47 | deepspeed --hostfile hostfile_4 \ 48 | mgm/train/train_mem.py \ 49 | --deepspeed ./scripts/zero3.json \ 50 | --model_name_or_path model_zoo/LLM/Nous-Hermes-2-Yi-34B \ 51 | --version chatml_direct \ 52 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 53 | --image_folder ./data/MGM-Finetune \ 54 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 55 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 56 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 57 | --mm_projector_type mlp2x_gelu \ 58 | --mm_vision_select_layer -2 \ 59 | --mm_use_im_start_end False \ 60 | --mm_use_im_patch_token False \ 61 | --image_aspect_ratio pad \ 62 | --group_by_modality_length True \ 63 | --image_size_aux $AUX_SIZE \ 64 | --bf16 True \ 65 | --output_dir ./work_dirs/$FINETUNE_NAME \ 66 | --num_train_epochs 1 \ 67 | --per_device_train_batch_size 4 \ 68 | --per_device_eval_batch_size 4 \ 69 | --gradient_accumulation_steps 1 \ 70 | --evaluation_strategy "no" \ 71 | --save_strategy "steps" \ 72 | --save_steps 1000 \ 73 | --save_total_limit 1 \ 74 | --learning_rate 1e-5 \ 75 | --lr_multi $LR_MULTI \ 76 | --weight_decay 0. \ 77 | --warmup_ratio 0.03 \ 78 | --lr_scheduler_type "cosine" \ 79 | --logging_steps 1 \ 80 | --tf32 True \ 81 | --model_max_length 2048 \ 82 | --gradient_checkpointing True \ 83 | --dataloader_num_workers 4 \ 84 | --lazy_preprocess True \ 85 | --report_to wandb 86 | -------------------------------------------------------------------------------- /scripts/yi/train/stage_2_full_yi34b_672_hr_1536.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PRETRAIN_NAME=MGM-34B-Pretrain 3 | FINETUNE_NAME=MGM-34B-HD 4 | AUX_SIZE=1536 5 | IMAGE_GRID=2 6 | IMAGE_GLOBAL=True 7 | LR_MULTI="model.mm_projector:2,model.vlm_uni:2" 8 | 9 | # delete --hostfile hostfile_4 and change --per_device_train_batch_size if trained on single machine 10 | 11 | 12 | deepspeed --hostfile hostfile_4 \ 13 | mgm/train/train_mem.py \ 14 | --deepspeed ./scripts/zero3.json \ 15 | --model_name_or_path model_zoo/LLM/Nous-Hermes-2-Yi-34B \ 16 | --version chatml_direct \ 17 | --data_path ./data/MGM-Finetune/mgm_instruction.json \ 18 | --image_folder ./data/MGM-Finetune \ 19 | --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \ 20 | --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \ 21 | --image_grid $IMAGE_GRID \ 22 | --image_global $IMAGE_GLOBAL \ 23 | --pretrain_mm_mlp_adapter ./work_dirs/$PRETRAIN_NAME/mm_projector.bin \ 24 | --mm_projector_type mlp2x_gelu \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --image_aspect_ratio pad \ 29 | --group_by_modality_length True \ 30 | --image_size_aux $AUX_SIZE \ 31 | --bf16 True \ 32 | --output_dir ./work_dirs/$FINETUNE_NAME \ 33 | --num_train_epochs 1 \ 34 | --per_device_train_batch_size 4 \ 35 | --per_device_eval_batch_size 4 \ 36 | --gradient_accumulation_steps 1 \ 37 | --evaluation_strategy "no" \ 38 | --save_strategy "steps" \ 39 | --save_steps 1000 \ 40 | --save_total_limit 1 \ 41 | --learning_rate 1e-5 \ 42 | --lr_multi $LR_MULTI \ 43 | --weight_decay 0. 
\ 44 | --warmup_ratio 0.03 \ 45 | --lr_scheduler_type "cosine" \ 46 | --logging_steps 1 \ 47 | --tf32 True \ 48 | --model_max_length 4096 \ 49 | --gradient_checkpointing True \ 50 | --dataloader_num_workers 4 \ 51 | --lazy_preprocess True \ 52 | --report_to wandb 53 | -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "offload_optimizer": { 19 | "device": "cpu", 20 | "pin_memory": true 21 | }, 22 | "offload_param": { 23 | "device": "cpu", 24 | "pin_memory": true 25 | }, 26 | "overlap_comm": true, 27 | "contiguous_gradients": true, 28 | "sub_group_size": 1e9, 29 | "reduce_bucket_size": "auto" 30 | } 31 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } --------------------------------------------------------------------------------
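Note on the DeepSpeed configs above: the "auto" fields in zero2.json, zero2_offload.json and zero3.json (train_micro_batch_size_per_gpu, gradient_accumulation_steps, train_batch_size, bf16/fp16 enabled, reduce_bucket_size) are resolved at launch time by the HuggingFace Trainer / DeepSpeed integration from the corresponding command-line flags in the training scripts, so the effective global batch size follows from --per_device_train_batch_size, --gradient_accumulation_steps and the total GPU count. A minimal sketch of that arithmetic is below; the WORLD_SIZE value of 8 is an assumption for illustration only, since the actual GPU count depends on the hostfile / hostfile_4 contents, which are not shown in this dump.

#!/bin/bash
# Effective global batch size for the stage-2 HD fine-tune scripts above.
PER_DEVICE=4                   # --per_device_train_batch_size in those scripts
GRAD_ACCUM=1                   # --gradient_accumulation_steps in those scripts
WORLD_SIZE="${WORLD_SIZE:-8}"  # total GPUs across all machines (assumed value)
echo "global batch size = $(( PER_DEVICE * GRAD_ACCUM * WORLD_SIZE ))"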
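Every training script above repeats the comment that --hostfile should be deleted and --per_device_train_batch_size changed when training on a single machine. The sketch below shows what that adjustment could look like for the stage-1 pretraining command, assuming a single node with 8 GPUs; the --num_gpus launcher flag, the batch size, and the gradient-accumulation value are illustrative assumptions, not settings validated in this repository, and all other flags are copied verbatim from stage_1_2_full_v7b_336_hr_768.sh.

#!/bin/bash
# Hypothetical single-machine variant of the stage-1 pretraining command.
# Assumptions: one node with 8 GPUs; --hostfile dropped per the scripts' own
# comment; batch size / gradient accumulation are illustrative guesses.
PRETRAIN_NAME=MGM-7B-Pretrain
AUX_SIZE=768

deepspeed --num_gpus 8 \
    mgm/train/train_mem.py \
    --deepspeed ./scripts/zero2_offload.json \
    --model_name_or_path model_zoo/LLM/vicuna/7B-V1.5 \
    --version plain \
    --data_path ./data/MGM-Pretrain/mgm_pretrain.json \
    --image_folder ./data/MGM-Pretrain \
    --vision_tower model_zoo/OpenAI/clip-vit-large-patch14-336 \
    --vision_tower_aux model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup \
    --mm_projector_type mlp2x_gelu \
    --tune_mm_mlp_adapter True \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --image_size_aux $AUX_SIZE \
    --bf16 True \
    --output_dir ./work_dirs/$PRETRAIN_NAME \
    --num_train_epochs 1 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 24000 \
    --save_total_limit 1 \
    --learning_rate 1e-3 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb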