├── .DS_Store ├── EVEv1 ├── .DS_Store ├── README.md ├── docs │ ├── Data.md │ └── Evaluation.md ├── eve │ ├── .DS_Store │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_gpt_review.py │ │ ├── eval_gpt_review_bench.py │ │ ├── eval_gpt_review_visual.py │ │ ├── eval_one_sample.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── eval_science_qa_gpt4.py │ │ ├── eval_science_qa_gpt4_requery.py │ │ ├── eval_textvqa.py │ │ ├── m4c_evaluator.py │ │ ├── model_qa.py │ │ ├── model_vqa.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_mmbench.py │ │ ├── model_vqa_qbench.py │ │ ├── model_vqa_science.py │ │ ├── qa_baseline_gpt35.py │ │ ├── run_eve.py │ │ ├── summarize_gpt_review.py │ │ └── table │ │ │ └── rule.json │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── eve_arch.py │ │ ├── language_model │ │ │ └── eve_llama.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ ├── clip_encoder.py │ │ │ ├── configuration_evaclip.py │ │ │ ├── modeling_evaclip.py │ │ │ └── vision_tokenizer.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── train │ │ ├── eve_trainer.py │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_xformers_attn_monkey_patch.py │ │ ├── train.py │ │ ├── train_mem.py │ │ └── train_xformers.py │ └── utils.py ├── examples │ ├── .DS_Store │ ├── beach.png │ ├── mac.jpg │ └── ocr_beijing.jpg ├── images │ ├── .DS_Store │ ├── eve_logo.png │ ├── eve_motivation1.png │ ├── eve_motivation2.png │ ├── eve_results.jpg │ └── eve_structure.png ├── pyproject.toml ├── scripts │ ├── .DS_Store │ ├── convert_gqa_for_eval.py │ ├── convert_mmbench_for_submission.py │ ├── convert_mmvet_for_eval.py │ ├── convert_seed_for_submission.py │ ├── convert_sqa_to_eve.py │ ├── convert_sqa_to_eve_base_prompt.py │ ├── convert_vizwiz_for_submission.py │ ├── convert_vqav2_for_submission.py │ ├── eve │ │ ├── .DS_Store │ │ ├── eval │ │ │ ├── gqa.sh │ │ │ ├── llavabench.sh │ │ │ ├── mmbench_cn.sh │ │ │ ├── mmbench_en.sh │ │ │ ├── mme.sh │ │ │ ├── mmvet.sh │ │ │ ├── pope.sh │ │ │ ├── qbench.sh │ │ │ ├── seed.sh │ │ │ ├── sqa.sh │ │ │ ├── textvqa.sh │ │ │ ├── vizwiz.sh │ │ │ └── vqav2.sh │ │ ├── eve7b_finetune.sh │ │ ├── eve7b_finetune_hd.sh │ │ ├── eve7b_prealign.sh │ │ ├── eve7b_pretrain.sh │ │ └── test_all_benchmark.sh │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json └── tools │ └── app.py ├── EVEv2 ├── .DS_Store ├── LICENSE ├── README.md ├── docs │ ├── Data.md │ └── Evaluation.md ├── eve │ ├── .DS_Store │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_gpt_review.py │ │ ├── eval_gpt_review_bench.py │ │ ├── eval_gpt_review_visual.py │ │ ├── eval_one_sample.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── eval_science_qa_gpt4.py │ │ ├── eval_science_qa_gpt4_requery.py │ │ ├── eval_textvqa.py │ │ ├── m4c_evaluator.py │ │ ├── model_qa.py │ │ ├── model_vqa.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_mmbench.py │ │ ├── model_vqa_qbench.py │ │ ├── model_vqa_science.py │ │ ├── qa_baseline_gpt35.py │ │ ├── run_eve.py │ │ ├── summarize_gpt_review.py │ │ └── table │ │ │ └── rule.json │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── eve_arch.py │ │ ├── language_model │ │ │ ├── eve_llama3.py │ │ │ ├── eve_qwen2.py │ │ │ ├── llama3 │ │ │ │ ├── __init__.py │ │ │ │ ├── configuration_llama.py │ │ │ │ ├── modeling_llama.py │ │ │ │ ├── tokenization_llama.py │ │ │ │ └── tokenization_llama_fast.py │ │ │ └── qwen2 │ │ │ │ ├── 
__init__.py │ │ │ │ ├── configuration_qwen2.py │ │ │ │ ├── modeling_qwen2.py │ │ │ │ ├── tokenization_qwen2.py │ │ │ │ └── tokenization_qwen2_fast.py │ │ ├── multimodal_encoder │ │ │ └── vision_tokenizer.py │ │ └── utils.py │ ├── train │ │ ├── eve_trainer.py │ │ ├── merge_moe.py │ │ ├── repeat_moe.py │ │ ├── replace_with_flash_attn.py │ │ ├── train.py │ │ └── train_mem.py │ └── utils.py ├── examples │ ├── .DS_Store │ ├── MAR.jpg │ ├── mac.jpg │ └── ocr_beijing.jpg ├── images │ ├── .DS_Store │ ├── eve_logo.png │ ├── eve_motivation.jpg │ ├── eve_results.jpg │ └── eve_structure.jpg ├── openai │ ├── eve-anyratio-res1600-patch16 │ │ └── preprocessor_config.json │ └── eve-anyratio-res800-patch16 │ │ └── preprocessor_config.json ├── pyproject.toml └── scripts │ ├── .DS_Store │ ├── convert_gqa_for_eval.py │ ├── convert_mmbench_for_submission.py │ ├── convert_mmvet_for_eval.py │ ├── convert_seed_for_submission.py │ ├── convert_sqa_to_eve.py │ ├── convert_sqa_to_eve_base_prompt.py │ ├── convert_vizwiz_for_submission.py │ ├── convert_vqav2_for_submission.py │ ├── eve │ ├── .DS_Store │ ├── 0_eve7b_prealign_anyratio_ve.sh │ ├── 0_notrain_copy_llm_weight_into_moe.sh │ ├── 1.0_eve7b_prealign_anyratio_ve_moe.sh │ ├── 1.1_eve7b_prealign_anyratio_ve_moe_hd.sh │ ├── 2_eve7b_fullalign_anyratio_hd.sh │ ├── 3_eve7b_finetune_anyratio_hd.sh │ ├── eval │ │ ├── gqa.sh │ │ ├── llavabench.sh │ │ ├── mmbench_cn.sh │ │ ├── mmbench_en.sh │ │ ├── mme.sh │ │ ├── mmvet.sh │ │ ├── pope.sh │ │ ├── qbench.sh │ │ ├── seed.sh │ │ ├── sqa.sh │ │ ├── textvqa.sh │ │ ├── vizwiz.sh │ │ └── vqav2.sh │ └── test_all_benchmark.sh │ ├── zero2.json │ ├── zero3.json │ └── zero3_offload.json ├── LICENSE └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/.DS_Store -------------------------------------------------------------------------------- /EVEv1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/.DS_Store -------------------------------------------------------------------------------- /EVEv1/docs/Data.md: -------------------------------------------------------------------------------- 1 | ## Data Composition 2 | 3 | | Data name | Data size | 4 | | --- | ---: | 5 | | EVE_pretrain_cap33M.json | 28 GB | 6 | | [LLaVA_v1_5_mix665K.json](https://drive.google.com/file/d/1cnuVSRQ6_W80ZBnEYCeCzG9KHMGO3XrG/view?usp=sharing) | 983 MB | 7 | | [EVE_instruct_mix1.8M.json](https://drive.google.com/file/d/1iGg85xdJhyZv-s1ttCe_SZ-CUk-hThjs/view?usp=sharing) | 2.1 GB | 8 | 9 | ### EVE-PT Dataset 10 | We introduce publicly available web-scale data, including image-only: SA-1B, OpenImages; and image-text: LAION. We remove noisy text captions and reproduce 33M high-quality descriptions via Emu2 (17B) and LLaVA-1.5 (13B) as EVE-cap33M. **We have no specific plan to release pretraining data.** You can download and filter images according to our paper's guidelines, utilizing [LLaVA-NEXT](https://github.com/LLaVA-VL/LLaVA-NeXT) to generate high-definition image descriptions, which would provide better results. 
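If you regenerate captions yourself, the snippet below is a minimal sketch of how the resulting annotation file could be assembled. It is an assumption-laden illustration: `generate_caption` is a hypothetical hook standing in for whatever captioner you run (e.g., LLaVA-NeXT), and the record layout follows the LLaVA-style `image`/`conversations` convention; the exact schema of `eve_pretrain_cap33m.json` is not published here and may differ.

```python
import json
from pathlib import Path


def generate_caption(image_path: str) -> str:
    """Hypothetical hook: call your captioner (e.g., LLaVA-NeXT) here."""
    raise NotImplementedError


def build_caption_json(image_root: str, output_json: str) -> None:
    records = []
    for image_path in sorted(Path(image_root).rglob("*.jpg")):
        records.append({
            "image": str(image_path.relative_to(image_root)),
            "conversations": [
                {"from": "human", "value": "<image>\nDescribe the image in detail."},
                {"from": "gpt", "value": generate_caption(str(image_path))},
            ],
        })
    with open(output_json, "w") as f:
        json.dump(records, f)


# Example (paths assume the directory layout shown below):
# build_caption_json("playground/data/EVE-Pretrain-33M/LAION-Dedump/images",
#                    "playground/data/EVE-Pretrain-33M/eve_pretrain_cap33m.json")
```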
11 | 12 | #### Prepare PT Images 13 | 14 | Organize the data as follows in `./playground/data/EVE-Pretrain-33M/`: 15 | 16 | ```none 17 | data 18 | ├── EVE-Pretrain-33M 19 | │ │── eve_pretrain_cap33m.json 20 | │ ├── LAION-Dedump 21 | │ │ ├── images 22 | │ │ │ ├── 000000 23 | │ │ │ ├── 000001 24 | │ │ │ ├── ... 25 | │ ├── Openimages_v6 26 | │ │ ├── images 27 | │ │ │ ├── V6Train1 28 | │ │ │ ├── V6Train2 29 | │ │ │ ├── ... 30 | │ ├── SAM-11M 31 | │ │ ├── images 32 | │ │ │ ├── 000000 33 | │ │ │ ├── 000001 34 | │ │ │ ├── ... 35 | ``` 36 | 37 | 38 | ### EVE-SFT Dataset 39 | We utilize LLaVA-v1_5-mix665K as SFT data to obtain the standard version of EVE-7B. Besides, we also attempt to enlarge the limitation of maximum resolution only in the SFT stage. To bridge the resolution gap between pre-training and fine-tuning stages, we further involve 1.2M SFT conversation data, including AI2D, Synthdog, DVQA, ChartQA, DocVQA, Vision-Flan, and Bunny-695K to obtain high-resolution version of EVE-7B-HD. 40 | 41 | #### Prepare SFT Images 42 | 43 | Please download the annotation of the final mixture SFT data: [llava_v1_5_mix665k.json](https://drive.google.com/file/d/1cnuVSRQ6_W80ZBnEYCeCzG9KHMGO3XrG/view?usp=sharing) and [eve_instruct_mix1.8m.json](https://drive.google.com/file/d/1iGg85xdJhyZv-s1ttCe_SZ-CUk-hThjs/view?usp=sharing); Then download the images from constituting datasets: 44 | 45 | - COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip) 46 | - GQA: [images](https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip) 47 | - OCR-VQA: [download script](https://drive.google.com/drive/folders/1_GYPY5UkUy7HIcR0zq3ZCFgeZN7BAfm_?usp=sharing). We save all files as `.jpg` 48 | - TextVQA: [train_val_images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) 49 | - VisualGenome: [images](https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip), [images2](https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip) 50 | - AI2D: [ai2d](https://huggingface.co/datasets/lmms-lab/ai2d) 51 | - Synthdog: [synthdog-en](https://huggingface.co/datasets/naver-clova-ix/synthdog-en) 52 | - DVQA: [DVQA](https://huggingface.co/datasets/skywalkerzhang19/DVQA) 53 | - ChartQA: [ChartQA](https://huggingface.co/datasets/lmms-lab/ChartQA) 54 | - DocVQA: [DocVQA](https://huggingface.co/datasets/lmms-lab/DocVQA) 55 | - Open_images: [Bunny-v1_0-data](https://huggingface.co/datasets/BoyaWu10/Bunny-v1_0-data) 56 | - Vision-Flan: [vision-flan_191-task_1k](https://huggingface.co/datasets/Vision-Flan/vision-flan_191-task_1k) 57 | 58 | Then, organize the data as follows in `./playground/data/EVE-Finetune/`: 59 | 60 | ```none 61 | data 62 | ├── EVE-Finetune 63 | │ │── llava_v1_5_mix665k.json 64 | │ │── eve_instruct_mix1.8m.json 65 | │ ├── ai2d 66 | │ │ ├── images 67 | │ │ ├── ... 68 | │ ├── chartqa 69 | │ │ ├── train 70 | │ │ ├── val 71 | │ │ ├── ... 72 | │ ├── coco 73 | │ │ ├── train2017 74 | │ │ ├── ... 75 | │ ├── docvqa 76 | │ │ ├── train 77 | │ │ ├── ... 78 | │ ├── dvqa 79 | │ │ ├── images 80 | │ │ ├── ... 81 | │ ├── gqa 82 | │ │ ├── images 83 | │ │ ├── ... 84 | │ ├── ocr_vqa 85 | │ │ ├── images 86 | │ │ ├── ... 87 | │ ├── open_images 88 | │ │ ├── 0a0bc91825468c45.jpg 89 | │ │ ├── ... 90 | │ ├── syndog 91 | │ │ ├── images 92 | │ │ ├── ... 93 | │ ├── textvqa 94 | │ │ ├── train_images 95 | │ │ ├── ... 96 | │ ├── vg 97 | │ │ ├── VG_100K 98 | │ │ ├── VG_100K_2 99 | │ │ ├── ... 100 | │ ├── Vision-Flan_vision-flan_191-task_1k 101 | │ │ ├── images_191task_1k 102 | │ │ ├── ... 
103 | ``` 104 | -------------------------------------------------------------------------------- /EVEv1/eve/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/eve/.DS_Store -------------------------------------------------------------------------------- /EVEv1/eve/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import EVELlamaForCausalLM 2 | -------------------------------------------------------------------------------- /EVEv1/eve/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /EVEv1/eve/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import time 5 | 6 | import openai 7 | import ray 8 | import tqdm 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | 13 | @ray.remote(num_cpus=4) 14 | def get_eval(content: str, max_tokens: int): 15 | while True: 16 | try: 17 | response = openai.ChatCompletion.create( 18 | model='gpt-4', 19 | messages=[{ 20 | 'role': 'system', 21 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 22 | }, { 23 | 'role': 'user', 24 | 'content': content, 25 | }], 26 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 27 | max_tokens=max_tokens, 28 | ) 29 | break 30 | except openai.error.RateLimitError: 31 | pass 32 | except Exception as e: 33 | print(e) 34 | time.sleep(NUM_SECONDS_TO_SLEEP) 35 | 36 | print('success!') 37 | return response['choices'][0]['message']['content'] 38 | 39 | 40 | def parse_score(review): 41 | try: 42 | score_pair = review.split('\n')[0] 43 | score_pair = score_pair.replace(',', ' ') 44 | sp = score_pair.split(' ') 45 | if len(sp) == 2: 46 | return [float(sp[0]), float(sp[1])] 47 | else: 48 | print('error', review) 49 | return [-1, -1] 50 | except Exception as e: 51 | print(e) 52 | print('error', review) 53 | return [-1, -1] 54 | 55 | 56 | if __name__ == '__main__': 57 | parser = argparse.ArgumentParser( 58 | description='ChatGPT-based QA evaluation.') 59 | parser.add_argument('-q', '--question') 60 | # parser.add_argument('-a', '--answer') 61 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 62 | parser.add_argument('-r', '--rule') 63 | parser.add_argument('-o', '--output') 64 | parser.add_argument('--max-tokens', type=int, default=1024, 65 | help='maximum number of tokens produced in the output') 66 | args = parser.parse_args() 67 | 68 | ray.init() 69 | 70 | f_q = open(os.path.expanduser(args.question)) 71 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 72 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 73 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 74 | 75 | review_file = open(f'{args.output}', 'w') 76 | 77 | js_list = [] 78 | handles = [] 79 | idx = 0 80 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 81 | # if idx == 1: 82 | # break 83 | 84 | ques = json.loads(ques_js) 85 | ans1 
= json.loads(ans1_js) 86 | ans2 = json.loads(ans2_js) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | rule = rule_dict['default'] 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Question]\n{ques["text"]}\n\n' 96 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 97 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 98 | f'[System]\n{prompt}\n\n') 99 | js_list.append({ 100 | 'id': idx+1, 101 | 'question_id': ques['question_id'], 102 | 'answer1_id': ans1['answer_id'], 103 | 'answer2_id': ans2['answer_id'], 104 | 'category': category}) 105 | idx += 1 106 | handles.append(get_eval.remote(content, args.max_tokens)) 107 | # To avoid the rate limit set by OpenAI 108 | time.sleep(NUM_SECONDS_TO_SLEEP) 109 | 110 | reviews = ray.get(handles) 111 | for idx, review in enumerate(reviews): 112 | scores = parse_score(review) 113 | js_list[idx]['content'] = review 114 | js_list[idx]['tuple'] = scores 115 | review_file.write(json.dumps(js_list[idx]) + '\n') 116 | review_file.close() 117 | -------------------------------------------------------------------------------- /EVEv1/eve/eval/eval_gpt_review_visual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import time 5 | 6 | import openai 7 | 8 | NUM_SECONDS_TO_SLEEP = 0.5 9 | 10 | 11 | def get_eval(content: str, max_tokens: int): 12 | while True: 13 | try: 14 | response = openai.ChatCompletion.create( 15 | model='gpt-4-0314', 16 | messages=[{ 17 | 'role': 'system', 18 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 19 | }, { 20 | 'role': 'user', 21 | 'content': content, 22 | }], 23 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 24 | max_tokens=max_tokens, 25 | ) 26 | break 27 | except openai.error.RateLimitError: 28 | pass 29 | except Exception as e: 30 | print(e) 31 | time.sleep(NUM_SECONDS_TO_SLEEP) 32 | 33 | return response['choices'][0]['message']['content'] 34 | 35 | 36 | def parse_score(review): 37 | try: 38 | score_pair = review.split('\n')[0] 39 | score_pair = score_pair.replace(',', ' ') 40 | sp = score_pair.split(' ') 41 | if len(sp) == 2: 42 | return [float(sp[0]), float(sp[1])] 43 | else: 44 | print('error', review) 45 | return [-1, -1] 46 | except Exception as e: 47 | print(e) 48 | print('error', review) 49 | return [-1, -1] 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser( 54 | description='ChatGPT-based QA evaluation.') 55 | parser.add_argument('-q', '--question') 56 | parser.add_argument('-c', '--context') 57 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 58 | parser.add_argument('-r', '--rule') 59 | parser.add_argument('-o', '--output') 60 | parser.add_argument('--max-tokens', type=int, default=1024, 61 | help='maximum number of tokens produced in the output') 62 | args = parser.parse_args() 63 | 64 | f_q = open(os.path.expanduser(args.question)) 65 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 66 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 67 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 68 | 69 | if os.path.isfile(os.path.expanduser(args.output)): 70 | cur_reviews = [json.loads(line) for line in open( 71 | os.path.expanduser(args.output))] 72 | else: 73 | cur_reviews = [] 74 | 75 | review_file = open(f'{args.output}', 'a') 76 | 77 | context_list = 
[json.loads(line) 78 | for line in open(os.path.expanduser(args.context))] 79 | image_to_context = {context['image']: context for context in context_list} 80 | 81 | handles = [] 82 | idx = 0 83 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 84 | ques = json.loads(ques_js) 85 | ans1 = json.loads(ans1_js) 86 | ans2 = json.loads(ans2_js) 87 | 88 | inst = image_to_context[ques['image']] 89 | cap_str = '\n'.join(inst['captions']) 90 | box_str = '\n'.join( 91 | [f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']]) 92 | 93 | category = json.loads(ques_js)['category'] 94 | if category in rule_dict: 95 | rule = rule_dict[category] 96 | else: 97 | assert False, f"Visual QA category not found in rule file: {category}." 98 | prompt = rule['prompt'] 99 | role = rule['role'] 100 | content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n' 101 | f'[Question]\n{ques["text"]}\n\n' 102 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 103 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 104 | f'[System]\n{prompt}\n\n') 105 | cur_js = { 106 | 'id': idx+1, 107 | 'question_id': ques['question_id'], 108 | 'answer1_id': ans1.get('answer_id', ans1['question_id']), 109 | 'answer2_id': ans2.get('answer_id', ans2['answer_id']), 110 | 'category': category 111 | } 112 | if idx >= len(cur_reviews): 113 | review = get_eval(content, args.max_tokens) 114 | scores = parse_score(review) 115 | cur_js['content'] = review 116 | cur_js['tuple'] = scores 117 | review_file.write(json.dumps(cur_js) + '\n') 118 | review_file.flush() 119 | else: 120 | print(f'Skipping {idx} as we already have it.') 121 | idx += 1 122 | print(idx) 123 | review_file.close() 124 | -------------------------------------------------------------------------------- /EVEv1/eve/eval/eval_one_sample.py: -------------------------------------------------------------------------------- 1 | from eve.model.builder import load_pretrained_model 2 | from eve.mm_utils import get_model_name_from_path 3 | from eve.eval.run_eve import eval_model 4 | 5 | model_path = "Absolute Path of BAAI/EVE-7B-HD-v1.0" 6 | 7 | prompt = "Please describle image contents in detail." 
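# image_file may be a local path or an http(s) URL; eve.eval.run_eve.load_image handles both.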
8 | image_file = "examples/ocr_beijing.jpg" 9 | 10 | args = type('Args', (), { 11 | "model_path": model_path, 12 | "model_base": None, 13 | "model_name": get_model_name_from_path(model_path), 14 | "query": prompt, 15 | "conv_mode": None, 16 | "image_file": image_file, 17 | "sep": ",", 18 | "temperature": 0, 19 | "top_p": None, 20 | "num_beams": 1, 21 | "max_new_tokens": 512 22 | })() 23 | 24 | eval_model(args) -------------------------------------------------------------------------------- /EVEv1/eve/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | 6 | def eval_pope(answers, label_file): 7 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 8 | 9 | for answer in answers: 10 | text = answer['text'] 11 | 12 | # Only keep the first sentence 13 | if text.find('.') != -1: 14 | text = text.split('.')[0] 15 | 16 | text = text.replace(',', '') 17 | words = text.split(' ') 18 | if 'No' in words or 'not' in words or 'no' in words: 19 | answer['text'] = 'no' 20 | else: 21 | answer['text'] = 'yes' 22 | 23 | for i in range(len(label_list)): 24 | if label_list[i] == 'no': 25 | label_list[i] = 0 26 | else: 27 | label_list[i] = 1 28 | 29 | pred_list = [] 30 | for answer in answers: 31 | if answer['text'] == 'no': 32 | pred_list.append(0) 33 | else: 34 | pred_list.append(1) 35 | 36 | pos = 1 37 | neg = 0 38 | yes_ratio = pred_list.count(1) / len(pred_list) 39 | 40 | TP, TN, FP, FN = 0, 0, 0, 0 41 | for pred, label in zip(pred_list, label_list): 42 | if pred == pos and label == pos: 43 | TP += 1 44 | elif pred == pos and label == neg: 45 | FP += 1 46 | elif pred == neg and label == neg: 47 | TN += 1 48 | elif pred == neg and label == pos: 49 | FN += 1 50 | 51 | print('TP\tFP\tTN\tFN\t') 52 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 53 | 54 | precision = float(TP) / float(TP + FP) 55 | recall = float(TP) / float(TP + FN) 56 | f1 = 2*precision*recall / (precision + recall) 57 | acc = (TP + TN) / (TP + TN + FP + FN) 58 | print('Accuracy: {}'.format(acc)) 59 | print('Precision: {}'.format(precision)) 60 | print('Recall: {}'.format(recall)) 61 | print('F1 score: {}'.format(f1)) 62 | print('Yes ratio: {}'.format(yes_ratio)) 63 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % 64 | (f1, acc, precision, recall, yes_ratio)) 65 | 66 | 67 | if __name__ == "__main__": 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument("--annotation-dir", type=str) 70 | parser.add_argument("--question-file", type=str) 71 | parser.add_argument("--result-file", type=str) 72 | args = parser.parse_args() 73 | 74 | questions = [json.loads(line) for line in open(args.question_file)] 75 | questions = {question['question_id']: question for question in questions} 76 | answers = [json.loads(q) for q in open(args.result_file)] 77 | for file in os.listdir(args.annotation_dir): 78 | assert file.startswith('coco_pope_') 79 | assert file.endswith('.json') 80 | category = file[10:-5] 81 | cur_answers = [ 82 | x for x in answers if questions[x['question_id']]['category'] == category] 83 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 84 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 85 | print("====================================") 86 | -------------------------------------------------------------------------------- /EVEv1/eve/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 
import os 4 | import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
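            # no single option letter could be parsed; fall through to FAILED so get_pred_idx returns -1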
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /EVEv1/eve/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /EVEv1/eve/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from eve.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /EVEv1/eve/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import shortuuid 6 | import torch 7 | from tqdm import tqdm 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria 9 | 10 | from eve.conversation import default_conversation 11 | from eve.utils import disable_torch_init 12 | 13 | 14 | # new stopping implementation 15 | class KeywordsStoppingCriteria(StoppingCriteria): 16 | def __init__(self, keywords, tokenizer, input_ids): 17 | self.keywords = keywords 18 | self.tokenizer = tokenizer 19 | self.start_len = None 20 | self.input_ids = input_ids 21 | 22 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 23 | if self.start_len is None: 24 | self.start_len = self.input_ids.shape[1] 25 | else: 26 | outputs = self.tokenizer.batch_decode( 27 | output_ids[:, self.start_len:], skip_special_tokens=True)[0] 28 | for keyword in self.keywords: 29 | if keyword in outputs: 30 | return True 31 | return False 32 | 33 | 34 | @torch.inference_mode() 35 | def eval_model(model_name, questions_file, answers_file): 36 | # Model 37 | disable_torch_init() 38 | model_name = os.path.expanduser(model_name) 39 | tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) 40 | model = AutoModelForCausalLM.from_pretrained(model_name, 41 | torch_dtype=torch.float16).cuda() 42 | 43 | ques_file = open(os.path.expanduser(questions_file), "r") 44 | ans_file = open(os.path.expanduser(answers_file), "w") 45 | for i, line in enumerate(tqdm(ques_file)): 46 | idx = json.loads(line)["question_id"] 47 | qs = json.loads(line)["text"] 48 | cat = 
json.loads(line)["category"] 49 | conv = default_conversation.copy() 50 | conv.append_message(conv.roles[0], qs) 51 | prompt = conv.get_prompt() 52 | inputs = tokenizer([prompt]) 53 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 54 | stopping_criteria = KeywordsStoppingCriteria( 55 | [conv.sep], tokenizer, input_ids) 56 | output_ids = model.generate( 57 | input_ids, 58 | do_sample=True, 59 | use_cache=True, 60 | temperature=0.7, 61 | max_new_tokens=1024, 62 | stopping_criteria=[stopping_criteria]) 63 | outputs = tokenizer.batch_decode( 64 | output_ids, skip_special_tokens=True)[0] 65 | try: 66 | index = outputs.index(conv.sep, len(prompt)) 67 | except ValueError: 68 | outputs += conv.sep 69 | index = outputs.index(conv.sep, len(prompt)) 70 | 71 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 72 | ans_id = shortuuid.uuid() 73 | ans_file.write(json.dumps({"question_id": idx, 74 | "text": outputs, 75 | "answer_id": ans_id, 76 | "model_id": model_name, 77 | "metadata": {}}) + "\n") 78 | ans_file.flush() 79 | ans_file.close() 80 | 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 85 | parser.add_argument("--question-file", type=str, 86 | default="tables/question.jsonl") 87 | parser.add_argument("--answers-file", type=str, default="answer.jsonl") 88 | args = parser.parse_args() 89 | 90 | eval_model(args.model_name, args.question_file, args.answers_file) 91 | -------------------------------------------------------------------------------- /EVEv1/eve/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import concurrent.futures 5 | import json 6 | import os 7 | import time 8 | 9 | import openai 10 | import shortuuid 11 | import tqdm 12 | 13 | openai.api_key = 'xxx' # replace with your own key 14 | MODEL = 'gpt-3.5-turbo' 15 | MODEL_ID = 'gpt-3.5-turbo:20230327' 16 | 17 | 18 | def get_answer(question_id: int, question: str, max_tokens: int): 19 | ans = { 20 | 'answer_id': shortuuid.uuid(), 21 | 'question_id': question_id, 22 | 'model_id': MODEL_ID, 23 | } 24 | for _ in range(3): 25 | try: 26 | response = openai.ChatCompletion.create( 27 | model=MODEL, 28 | messages=[{ 29 | 'role': 'system', 30 | 'content': 'You are a helpful assistant.' 
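                # the user turn below carries the raw benchmark question; temperature=0 keeps this GPT-3.5 baseline deterministic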
31 | }, { 32 | 'role': 'user', 33 | 'content': question, 34 | }], 35 | temperature=0, 36 | max_tokens=max_tokens, 37 | ) 38 | ans['text'] = response['choices'][0]['message']['content'] 39 | return ans 40 | except Exception as e: 41 | print('[ERROR]', e) 42 | ans['text'] = '#ERROR#' 43 | time.sleep(1) 44 | return ans 45 | 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 49 | parser.add_argument('-q', '--question') 50 | parser.add_argument('-o', '--output') 51 | parser.add_argument('--max-tokens', type=int, default=1024, 52 | help='maximum number of tokens produced in the output') 53 | args = parser.parse_args() 54 | 55 | questions_dict = {} 56 | with open(os.path.expanduser(args.question)) as f: 57 | for line in f: 58 | if not line: 59 | continue 60 | q = json.loads(line) 61 | questions_dict[q['question_id']] = q['text'] 62 | 63 | answers = [] 64 | 65 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 66 | futures = [] 67 | for qid, question in questions_dict.items(): 68 | future = executor.submit( 69 | get_answer, qid, question, args.max_tokens) 70 | futures.append(future) 71 | 72 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 73 | answers.append(future.result()) 74 | 75 | answers.sort(key=lambda x: x['question_id']) 76 | 77 | with open(os.path.expanduser(args.output), 'w') as f: 78 | table = [json.dumps(ans) for ans in answers] 79 | f.write('\n'.join(table)) 80 | -------------------------------------------------------------------------------- /EVEv1/eve/eval/run_eve.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from io import BytesIO 3 | 4 | import requests 5 | import torch 6 | from PIL import Image 7 | 8 | from eve.constants import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, 9 | DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) 10 | from eve.conversation import SeparatorStyle, conv_templates 11 | from eve.mm_utils import (KeywordsStoppingCriteria, process_images, 12 | get_model_name_from_path, tokenizer_image_token) 13 | from eve.model.builder import load_pretrained_model 14 | from eve.utils import disable_torch_init 15 | 16 | 17 | def load_image(image_file): 18 | if image_file.startswith('http') or image_file.startswith('https'): 19 | response = requests.get(image_file) 20 | image = Image.open(BytesIO(response.content)).convert('RGB') 21 | else: 22 | image = Image.open(image_file).convert('RGB') 23 | return image 24 | 25 | 26 | def eval_model(args): 27 | # Model 28 | disable_torch_init() 29 | 30 | model_name = get_model_name_from_path(args.model_path) 31 | tokenizer, model, image_processor, context_len = load_pretrained_model( 32 | args.model_path, args.model_base, model_name) 33 | 34 | qs = args.query 35 | if model.config.mm_use_im_start_end: 36 | qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + \ 37 | DEFAULT_IM_END_TOKEN + '\n' + qs 38 | else: 39 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 40 | 41 | if 'llama-2' in model_name.lower(): 42 | conv_mode = "eve_llama_2" 43 | elif "v1" in model_name.lower(): 44 | conv_mode = "eve_v1" 45 | elif "mpt" in model_name.lower(): 46 | conv_mode = "mpt" 47 | else: 48 | conv_mode = "eve_v0" 49 | 50 | if args.conv_mode is not None and conv_mode != args.conv_mode: 51 | print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format( 52 | conv_mode, args.conv_mode, args.conv_mode)) 53 | else: 54 | args.conv_mode = conv_mode 55 | 
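    # Build the prompt from the selected conversation template: the user turn already contains the
    # image placeholder token(s) prepended above plus the text query, and the assistant turn is left
    # empty so that generation continues from it.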
56 | conv = conv_templates[args.conv_mode].copy() 57 | conv.append_message(conv.roles[0], qs) 58 | conv.append_message(conv.roles[1], None) 59 | prompt = conv.get_prompt() 60 | 61 | image = load_image(args.image_file) 62 | # image_tensor = image_processor.preprocess(image, return_tensors='pt')[ 63 | # 'pixel_values'].half().cuda() 64 | image_tensor = process_images([image], image_processor, None)[0] 65 | image_tensor = image_tensor.unsqueeze(0).half().cuda() 66 | 67 | input_ids = tokenizer_image_token( 68 | prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 69 | 70 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 71 | keywords = [stop_str] 72 | stopping_criteria = KeywordsStoppingCriteria( 73 | keywords, tokenizer, input_ids) 74 | 75 | with torch.inference_mode(): 76 | output_ids = model.generate( 77 | input_ids, 78 | images=image_tensor, 79 | do_sample=True, 80 | temperature=0.2, 81 | max_new_tokens=1024, 82 | use_cache=True, 83 | stopping_criteria=[stopping_criteria]) 84 | 85 | input_token_len = input_ids.shape[1] 86 | n_diff_input_output = ( 87 | input_ids != output_ids[:, :input_token_len]).sum().item() 88 | if n_diff_input_output > 0: 89 | print( 90 | f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 91 | outputs = tokenizer.batch_decode( 92 | output_ids[:, input_token_len:], skip_special_tokens=True)[0] 93 | outputs = outputs.strip() 94 | if outputs.endswith(stop_str): 95 | outputs = outputs[:-len(stop_str)] 96 | outputs = outputs.strip() 97 | print(outputs) 98 | 99 | 100 | if __name__ == "__main__": 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument("--model-path", type=str, default="facebook/opt-350m") 103 | parser.add_argument("--model-base", type=str, default=None) 104 | parser.add_argument("--image-file", type=str, required=True) 105 | parser.add_argument("--query", type=str, required=True) 106 | parser.add_argument("--conv-mode", type=str, default=None) 107 | args = parser.parse_args() 108 | 109 | eval_model(args) 110 | -------------------------------------------------------------------------------- /EVEv1/eve/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | from collections import defaultdict 5 | 6 | import numpy as np 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser( 11 | description='ChatGPT-based QA evaluation.') 12 | parser.add_argument('-d', '--dir', default=None) 13 | parser.add_argument('-v', '--version', default=None) 14 | parser.add_argument('-s', '--select', nargs='*', default=None) 15 | parser.add_argument('-f', '--files', nargs='*', default=[]) 16 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 17 | return parser.parse_args() 18 | 19 | 20 | if __name__ == '__main__': 21 | args = parse_args() 22 | 23 | if args.ignore is not None: 24 | args.ignore = [int(x) for x in args.ignore] 25 | 26 | if len(args.files) > 0: 27 | review_files = args.files 28 | else: 29 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith( 30 | 'gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 31 | 32 | for review_file in sorted(review_files): 33 | config = os.path.basename(review_file).replace( 34 | 'gpt4_text_', '').replace('.jsonl', '') 35 | if args.select is not None and any(x not in config for x in args.select): 36 | continue 37 | if '0613' in config: 38 | 
version = '0613' 39 | else: 40 | version = '0314' 41 | if args.version is not None and args.version != version: 42 | continue 43 | scores = defaultdict(list) 44 | print(config) 45 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 46 | for review_str in f: 47 | review = json.loads(review_str) 48 | if review['question_id'] in args.ignore: 49 | continue 50 | if 'category' in review: 51 | scores[review['category']].append(review['tuple']) 52 | scores['all'].append(review['tuple']) 53 | else: 54 | if 'tuple' in review: 55 | scores['all'].append(review['tuple']) 56 | else: 57 | scores['all'].append(review['score']) 58 | for k, v in sorted(scores.items()): 59 | stats = np.asarray(v).mean(0).tolist() 60 | stats = [round(x, 3) for x in stats] 61 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 62 | print(k, round(stats[1]/stats[0]*100, 1), 63 | round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 64 | print('=================================') 65 | -------------------------------------------------------------------------------- /EVEv1/eve/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.eve_llama import (EVEConfig,EVELlamaForCausalLM) 2 | from .multimodal_encoder.configuration_evaclip import EvaCLIPVisionConfig 3 | from .multimodal_encoder.modeling_evaclip import EvaCLIPVisionModel 4 | -------------------------------------------------------------------------------- /EVEv1/eve/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m eve.model.consolidate --src ~/model_weights/eve-7b --dst ~/model_weights/eve-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | 10 | from eve.model import * 11 | from eve.model.utils import auto_upgrade 12 | 13 | 14 | def consolidate_ckpt(src_path, dst_path): 15 | print("Loading model") 16 | auto_upgrade(src_path) 17 | src_model = AutoModelForCausalLM.from_pretrained( 18 | src_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) 19 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 20 | src_model.save_pretrained(dst_path) 21 | src_tokenizer.save_pretrained(dst_path) 22 | 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--src", type=str, required=True) 27 | parser.add_argument("--dst", type=str, required=True) 28 | 29 | args = parser.parse_args() 30 | 31 | consolidate_ckpt(args.src, args.dst) 32 | -------------------------------------------------------------------------------- /EVEv1/eve/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from .clip_encoder import CLIPVisionTower 4 | 5 | 6 | def build_vision_tower(vision_tower_cfg, **kwargs): 7 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower_clip', 8 | getattr(vision_tower_cfg, 'vision_tower_clip', None)) 9 | is_absolute_path_exists = os.path.exists(vision_tower) 10 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or vision_tower.startswith("Lin-Chen"): 11 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 12 | 13 | raise ValueError(f'Unknown vision tower: {vision_tower}') 14 | -------------------------------------------------------------------------------- 
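A minimal usage sketch of the `build_vision_tower` helper above; the config object and its attribute values are illustrative assumptions (the attribute names match what `builder.py` and `CLIPVisionTower` read), not the project's training defaults.

```python
from types import SimpleNamespace

from eve.model.multimodal_encoder.builder import build_vision_tower

# Illustrative config: builder.py resolves mm_vision_tower_clip / vision_tower_clip,
# and CLIPVisionTower reads mm_vision_select_layer / mm_vision_select_feature.
cfg = SimpleNamespace(
    vision_tower_clip="openai/clip-vit-large-patch14",  # assumed checkpoint id
    mm_vision_select_layer=-2,
    mm_vision_select_feature="patch",
)

tower = build_vision_tower(cfg, delay_load=True)
print(tower.config)  # with delay_load=True this is cfg_only until load_model() is called
```

With `delay_load=True`, only the vision config is resolved up front; the image processor and backbone weights are loaded later by `load_model()`.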
/EVEv1/eve/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel 4 | 5 | from .configuration_evaclip import EvaCLIPVisionConfig 6 | from .modeling_evaclip import EvaCLIPVisionModel 7 | 8 | 9 | class CLIPVisionTower(nn.Module): 10 | def __init__(self, vision_tower, args, delay_load=False): 11 | super().__init__() 12 | 13 | self.is_loaded = False 14 | 15 | self.vision_tower_name = vision_tower 16 | self.select_layer = args.mm_vision_select_layer 17 | self.select_feature = getattr( 18 | args, 'mm_vision_select_feature', 'patch') 19 | 20 | if not delay_load: 21 | self.load_model() 22 | else: 23 | self.cfg_only = CLIPVisionConfig.from_pretrained( 24 | self.vision_tower_name) 25 | 26 | def load_model(self): 27 | print(f'Load vision tower from {self.vision_tower_name}') 28 | self.image_processor = CLIPImageProcessor.from_pretrained( 29 | self.vision_tower_name) 30 | if 'eva' in self.vision_tower_name.lower(): 31 | vision_cfg = EvaCLIPVisionConfig.from_pretrained( 32 | self.vision_tower_name) 33 | self.backbone = EvaCLIPVisionModel.from_pretrained( 34 | self.vision_tower_name, config=vision_cfg) 35 | else: 36 | self.backbone = CLIPVisionModel.from_pretrained( 37 | self.vision_tower_name) 38 | self.backbone.requires_grad_(False) 39 | 40 | self.is_loaded = True 41 | 42 | def feature_select(self, image_forward_outs): 43 | image_features = image_forward_outs.hidden_states[self.select_layer] 44 | if self.select_feature == 'patch': 45 | image_features = image_features[:, 1:] 46 | elif self.select_feature == 'cls_patch': 47 | image_features = image_features 48 | else: 49 | raise ValueError( 50 | f'Unexpected select feature: {self.select_feature}') 51 | return image_features 52 | 53 | # @torch.no_grad() comment to enable fine-tune vit 54 | def forward(self, images): 55 | if type(images) is list: 56 | image_features = [] 57 | for image in images: 58 | image_forward_out = self.backbone(image.to( 59 | device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 60 | image_feature = self.feature_select( 61 | image_forward_out).to(image.dtype) 62 | image_features.append(image_feature) 63 | else: 64 | image_forward_outs = self.backbone( 65 | images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 66 | image_features = self.feature_select( 67 | image_forward_outs).to(images.dtype) 68 | 69 | return image_features 70 | 71 | @property 72 | def dummy_feature(self): 73 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 74 | 75 | @property 76 | def dtype(self): 77 | return self.backbone.dtype 78 | 79 | @property 80 | def device(self): 81 | return self.backbone.device 82 | 83 | @property 84 | def config(self): 85 | if self.is_loaded: 86 | return self.backbone.config 87 | else: 88 | return self.cfg_only 89 | 90 | @property 91 | def hidden_size(self): 92 | return self.config.hidden_size 93 | 94 | @property 95 | def num_patches(self): 96 | return (self.config.image_size // self.config.patch_size) ** 2 97 | -------------------------------------------------------------------------------- /EVEv1/eve/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import torch.nn as nn 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def 
forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | 29 | def forward(self, x): 30 | x = self.pre_norm(x) 31 | return x + self.proj(x) 32 | 33 | 34 | def build_vision_projector(config, delay_load=False, **kwargs): 35 | projector_type = getattr(config, 'mm_projector_type', 'linear') 36 | 37 | if projector_type == 'linear': 38 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 39 | 40 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 41 | if mlp_gelu_match: 42 | mlp_depth = int(mlp_gelu_match.group(1)) 43 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 44 | for _ in range(1, mlp_depth): 45 | modules.append(nn.GELU()) 46 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 47 | return nn.Sequential(*modules) 48 | 49 | if projector_type == 'identity': 50 | return IdentityMap() 51 | 52 | raise ValueError(f'Unknown projector type: {projector_type}') 53 | -------------------------------------------------------------------------------- /EVEv1/eve/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'eve' in config and 'eve' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer EVE code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "eve") 15 | cfg.architectures[0] = 'EVELlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /EVEv1/eve/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | from eve.train.train import train 7 | from eve.train.llama_flash_attn_monkey_patch import \ 8 | replace_llama_attn_with_flash_attn 9 | 10 | replace_llama_attn_with_flash_attn() 11 | 12 | 13 | if __name__ == "__main__": 14 | train() 15 | -------------------------------------------------------------------------------- /EVEv1/eve/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 
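# (As written, the imports below execute first and the patch is applied immediately afterwards,
#  still before train() constructs the model.)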
4 | from eve.train.train import train 5 | from eve.train.llama_xformers_attn_monkey_patch import \ 6 | replace_llama_attn_with_xformers_attn 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | 11 | if __name__ == "__main__": 12 | train() 13 | -------------------------------------------------------------------------------- /EVEv1/eve/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def disable_torch_init(): 5 | """ 6 | Disable the redundant torch default initialization to accelerate model creation. 7 | """ 8 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 9 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 10 | -------------------------------------------------------------------------------- /EVEv1/examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/examples/.DS_Store -------------------------------------------------------------------------------- /EVEv1/examples/beach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/examples/beach.png -------------------------------------------------------------------------------- /EVEv1/examples/mac.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/examples/mac.jpg -------------------------------------------------------------------------------- /EVEv1/examples/ocr_beijing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/examples/ocr_beijing.jpg -------------------------------------------------------------------------------- /EVEv1/images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/images/.DS_Store -------------------------------------------------------------------------------- /EVEv1/images/eve_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/images/eve_logo.png -------------------------------------------------------------------------------- /EVEv1/images/eve_motivation1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/images/eve_motivation1.png -------------------------------------------------------------------------------- /EVEv1/images/eve_motivation2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/images/eve_motivation2.png -------------------------------------------------------------------------------- /EVEv1/images/eve_results.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/images/eve_results.jpg 
-------------------------------------------------------------------------------- /EVEv1/images/eve_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/images/eve_structure.png -------------------------------------------------------------------------------- /EVEv1/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "eve" 7 | version = "1.0.0" 8 | description = "Unveiling Encoder-Free Vision-Language Models." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "torch==2.0.1", "torchvision==0.15.2", 17 | "transformers==4.31.0", "tokenizers>=0.12.1,<0.14", "sentencepiece==0.1.99", "shortuuid", 18 | "accelerate==0.21.0", "peft==0.4.0", "bitsandbytes==0.41.0", "xformers==0.0.21", 19 | "markdown2[all]", "numpy", "scikit-learn==1.2.2", "urllib3==1.26.0", 20 | "gradio==4.5.0", "requests", "httpx==0.24.0", "uvicorn", "fastapi","ftfy", 21 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13", "prettytable", "openpyxl" 22 | ] 23 | 24 | [project.optional-dependencies] 25 | train = ["deepspeed", "ninja", "wandb", "tensorboardX"] 26 | 27 | [project.urls] 28 | "Homepage" = "https://eve.github.io/" 29 | "Bug Tracker" = "https://github.com/baaivision/EVE/issues" 30 | 31 | [tool.setuptools.packages.find] 32 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 33 | 34 | [tool.wheel] 35 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 36 | -------------------------------------------------------------------------------- /EVEv1/scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/scripts/.DS_Store -------------------------------------------------------------------------------- /EVEv1/scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--src", type=str) 6 | parser.add_argument("--dst", type=str) 7 | args = parser.parse_args() 8 | 9 | all_answers = [] 10 | for line_idx, line in enumerate(open(args.src)): 11 | res = json.loads(line) 12 | question_id = res['question_id'] 13 | text = res['text'].rstrip('.').lower() 14 | all_answers.append({"questionId": question_id, "prediction": text}) 15 | 16 | with open(args.dst, 'w') as f: 17 | json.dump(all_answers, f) 18 | -------------------------------------------------------------------------------- /EVEv1/scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import pandas as pd 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--annotation-file", type=str, required=True) 11 | parser.add_argument("--result-dir", type=str, required=True) 12 | parser.add_argument("--upload-dir", type=str, required=True) 13 | parser.add_argument("--experiment", type=str, 
required=True) 14 | 15 | return parser.parse_args() 16 | 17 | 18 | if __name__ == "__main__": 19 | args = get_args() 20 | 21 | df = pd.read_table(args.annotation_file) 22 | 23 | cur_df = df.copy() 24 | cur_df = cur_df.drop( 25 | columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 26 | cur_df.insert(6, 'prediction', None) 27 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 28 | pred = json.loads(pred) 29 | cur_df.loc[df['index'] == pred['question_id'], 30 | 'prediction'] = pred['text'] 31 | 32 | cur_df.to_excel(os.path.join(args.upload_dir, 33 | f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 34 | -------------------------------------------------------------------------------- /EVEv1/scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /EVEv1/scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | 5 | def get_args(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--annotation-file", type=str) 8 | parser.add_argument("--result-file", type=str) 9 | parser.add_argument("--result-upload-file", type=str) 10 | parser.add_argument('-t', "--tolerance", action='store_true') 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: 24 | continue 25 | data_type = question_data['question_type_id'] 26 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 27 | try: 28 | question_id = int(question_data['question_id']) 29 | except: 30 | question_id = question_data['question_id'] 31 | if question_id not in results: 32 | correct_counts[data_type] = correct_counts.get(data_type, 0) 33 | continue 34 | row = results[question_id] 35 | if args.tolerance: 36 | if row['text'] == question_data['answer'] or row['text'][0] == question_data['answer']: 37 | correct_counts[data_type] = correct_counts.get( 38 | data_type, 0) + 1 39 | else: 40 | if row['text'] == question_data['answer']: 41 | correct_counts[data_type] = correct_counts.get( 42 | data_type, 0) + 1 43 | 44 | total_count = 0 45 | total_correct = 0 46 | for data_type in sorted(type_counts.keys()): 47 | correct_count = correct_counts.get(data_type, 0) 48 | total_questions = type_counts[data_type] 49 | 50 | # Guard against division by zero 51 | accuracy = (correct_count / total_questions * 52 | 100) if total_questions > 0 else 0.0 53 | if eval_only_type is None: 54 | print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 55 | 56 | total_count += type_counts[data_type] 57 | total_correct += correct_count 58 | 59 | total_accuracy = total_correct /
total_count * 100 60 | if eval_only_type is None: 61 | print(f"Total accuracy: {total_accuracy:.2f}%") 62 | else: 63 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 64 | 65 | return results 66 | 67 | 68 | if __name__ == "__main__": 69 | args = get_args() 70 | data = json.load(open(args.annotation_file)) 71 | ques_type_id_to_name = {id: n for n, id in data['question_type'].items()} 72 | 73 | results = eval_single(args.result_file) 74 | eval_single(args.result_file, eval_only_type='image') 75 | -------------------------------------------------------------------------------- /EVEv1/scripts/convert_sqa_to_eve.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import fire 5 | 6 | from scripts.convert_sqa_to_eve_base_prompt import build_prompt_chatbot 7 | 8 | 9 | def convert_to_eve(base_dir, split, prompt_format="QCM-LEA"): 10 | split_indices = json.load( 11 | open(os.path.join(base_dir, "pid_splits.json")))[split] 12 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 13 | 14 | split_problems = build_prompt_chatbot( 15 | problems, split_indices, prompt_format, 16 | use_caption=False, is_test=False) 17 | 18 | target_format = [] 19 | for prob_id, (input, output) in split_problems.items(): 20 | if input.startswith('Question: '): 21 | input = input.replace('Question: ', '') 22 | if output.startswith('Answer: '): 23 | output = output.replace('Answer: ', '') 24 | 25 | raw_prob_data = problems[prob_id] 26 | if raw_prob_data['image'] is None: 27 | target_format.append({ 28 | "id": prob_id, 29 | "conversations": [ 30 | {'from': 'human', 'value': f"{input}"}, 31 | {'from': 'gpt', 'value': f"{output}"}, 32 | ], 33 | }) 34 | 35 | else: 36 | target_format.append({ 37 | "id": prob_id, 38 | "image": os.path.join(prob_id, raw_prob_data['image']), 39 | "conversations": [ 40 | {'from': 'human', 'value': f"{input}\n"}, 41 | {'from': 'gpt', 'value': f"{output}"}, 42 | ], 43 | }) 44 | 45 | print(f'Number of samples: {len(target_format)}') 46 | 47 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 48 | json.dump(target_format, f, indent=2) 49 | 50 | 51 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 52 | split_indices = json.load( 53 | open(os.path.join(base_dir, "pid_splits.json")))[split] 54 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 55 | 56 | split_problems = build_prompt_chatbot( 57 | problems, split_indices, prompt_format, 58 | use_caption=False, is_test=False) 59 | 60 | writer = open(os.path.join( 61 | base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 62 | for prob_id, (input, output) in split_problems.items(): 63 | if input.startswith('Question: '): 64 | input = input.replace('Question: ', '') 65 | if output.startswith('Answer: '): 66 | output = output.replace('Answer: ', '') 67 | 68 | raw_prob_data = problems[prob_id] 69 | if raw_prob_data['image'] is None: 70 | data = { 71 | "id": prob_id, 72 | "instruction": f"{input}", 73 | "output": f"{output}", 74 | } 75 | 76 | else: 77 | data = { 78 | "id": prob_id, 79 | "image": os.path.join(prob_id, raw_prob_data['image']), 80 | "instruction": f"{input}\n", 81 | "output": f"{output}", 82 | } 83 | writer.write(json.dumps(data) + '\n') 84 | writer.close() 85 | 86 | 87 | def main(task, **kwargs): 88 | globals()[task](**kwargs) 89 | 90 | 91 | if __name__ == "__main__": 92 | fire.Fire(main) 93 | -------------------------------------------------------------------------------- 
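Usage note for convert_sqa_to_eve.py above: the script is dispatched through python-fire, so the first positional argument selects the task (convert_to_eve or convert_to_jsonl) and the remaining flags map onto that function's keyword arguments. A minimal invocation sketch, assuming the ScienceQA annotations (problems.json, pid_splits.json) sit under ./playground/data/eval/scienceqa; that directory is an assumption, not something the script fixes:

python scripts/convert_sqa_to_eve.py convert_to_eve --base_dir ./playground/data/eval/scienceqa --split train --prompt_format QCM-LEA
python scripts/convert_sqa_to_eve.py convert_to_jsonl --base_dir ./playground/data/eval/scienceqa --split val --prompt_format QCM-LEPA

convert_to_eve writes llava_{split}_{prompt_format}.json and convert_to_jsonl writes scienceqa_{split}_{prompt_format}.jsonl, both into the --base_dir directory.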
/EVEv1/scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import sys 6 | from eve.eval.m4c_evaluator import EvalAIAnswerProcessor 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str, required=True) 12 | parser.add_argument('--result-file', type=str, required=True) 13 | parser.add_argument('--result-upload-file', type=str, required=True) 14 | return parser.parse_args() 15 | 16 | 17 | if __name__ == '__main__': 18 | 19 | args = parse_args() 20 | 21 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 22 | 23 | results = [] 24 | error_line = 0 25 | for line_idx, line in enumerate(open(args.result_file)): 26 | try: 27 | results.append(json.loads(line)) 28 | except: 29 | error_line += 1 30 | results = {x['question_id']: x['text'] for x in results} 31 | test_split = [json.loads(line) for line in open(args.annotation_file)] 32 | split_ids = set([x['question_id'] for x in test_split]) 33 | 34 | print( 35 | f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 36 | 37 | all_answers = [] 38 | 39 | answer_processor = EvalAIAnswerProcessor() 40 | 41 | for x in test_split: 42 | assert x['question_id'] in results 43 | all_answers.append({ 44 | 'image': x['image'], 45 | 'answer': answer_processor(results[x['question_id']]) 46 | }) 47 | 48 | with open(args.result_upload_file, 'w') as f: 49 | json.dump(all_answers, f) 50 | -------------------------------------------------------------------------------- /EVEv1/scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | import sys 6 | from eve.eval.m4c_evaluator import EvalAIAnswerProcessor 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 12 | parser.add_argument('--ckpt', type=str, required=True) 13 | parser.add_argument('--split', type=str, required=True) 14 | return parser.parse_args() 15 | 16 | 17 | if __name__ == '__main__': 18 | 19 | args = parse_args() 20 | 21 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 22 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 23 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 24 | os.makedirs(os.path.dirname(dst), exist_ok=True) 25 | 26 | results = [] 27 | error_line = 0 28 | for line_idx, line in enumerate(open(src)): 29 | try: 30 | results.append(json.loads(line)) 31 | except: 32 | error_line += 1 33 | 34 | results = {x['question_id']: x['text'] for x in results} 35 | test_split = [json.loads(line) for line in open(test_split)] 36 | split_ids = set([x['question_id'] for x in test_split]) 37 | 38 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 39 | 40 | all_answers = [] 41 | 42 | answer_processor = EvalAIAnswerProcessor() 43 | 44 | for x in test_split: 45 | if x['question_id'] not in results: 46 | all_answers.append({ 47 | 'question_id': x['question_id'], 48 | 'answer': '' 49 | }) 50 | else: 51 | all_answers.append({ 52 | 'question_id': x['question_id'], 53 | 'answer': answer_processor(results[x['question_id']]) 54 | }) 55 | 56 | with open(dst, 'w') as f: 57 | json.dump(all_answers, f) 58
| -------------------------------------------------------------------------------- /EVEv1/scripts/eve/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv1/scripts/eve/.DS_Store -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | 5 | CHUNKS=${#GPULIST[@]} 6 | 7 | CKPT_NAME=$1 8 | CKPT_PATH=$2 9 | 10 | SPLIT="eve_gqa_testdev_balanced" 11 | GQADIR="./playground/data/eval/gqa/data" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_loader \ 15 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 16 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 17 | --image-folder ./playground/data/eval/gqa/data/images \ 18 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT_NAME/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT_NAME/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 38 | 39 | cd $GQADIR 40 | python eval/eval.py --tier testdev_balanced 41 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | 5 | python -m eve.eval.model_vqa \ 6 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 7 | --question-file ./playground/data/eval/llava-bench-in-the-wild/resources/questions.jsonl \ 8 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/resources/images \ 9 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/${CKPT_NAME}.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 14 | 15 | python eve/eval/eval_gpt_review_bench.py \ 16 | --question playground/data/eval/llava-bench-in-the-wild/resources/questions.jsonl \ 17 | --context playground/data/eval/llava-bench-in-the-wild/resources/context.jsonl \ 18 | --rule eve/eval/table/rule.json \ 19 | --answer-list \ 20 | playground/data/eval/llava-bench-in-the-wild/resources/answers_gpt4.jsonl \ 21 | playground/data/eval/llava-bench-in-the-wild/answers/${CKPT_NAME}.jsonl \ 22 | --output \ 23 | playground/data/eval/llava-bench-in-the-wild/reviews/${CKPT_NAME}-eval1.jsonl 24 | 25 | python eve/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/${CKPT_NAME}-eval1.jsonl 26 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read 
-ra GPULIST <<< "$gpu_list" 4 | CHUNKS=${#GPULIST[@]} 5 | 6 | CKPT_NAME=$1 7 | CKPT_PATH=$2 8 | LANG="cn" 9 | SPLIT="mmbench_dev_cn_20231003" 10 | 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_mmbench \ 14 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 15 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 16 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --lang $LANG \ 20 | --single-pred-prompt \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | wait 38 | 39 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 40 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT_NAME} 41 | 42 | python scripts/convert_mmbench_for_submission.py \ 43 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 44 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME} \ 45 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT_NAME} \ 46 | --experiment merge 47 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/mmbench_en.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | CHUNKS=${#GPULIST[@]} 5 | 6 | CKPT_NAME=$1 7 | CKPT_PATH=$2 8 | SPLIT="mmbench_dev_20230712" 9 | LANG="en" 10 | 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_mmbench \ 14 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 15 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 16 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --lang $LANG \ 20 | --single-pred-prompt \ 21 | --temperature 0 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | wait 38 | 39 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 40 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT_NAME} 41 | 42 | python scripts/convert_mmbench_for_submission.py \ 43 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 44 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME} \ 45 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT_NAME} \ 46 | --experiment merge 47 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | 5 | python -m eve.eval.model_vqa_loader \ 6 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 7 | --question-file ./playground/data/eval/MME/eve_mme.jsonl \ 8 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 9 | --answers-file ./playground/data/eval/MME/answers/${CKPT_NAME}.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | cd ./playground/data/eval/MME 14 | 15 | python convert_answer_to_mme.py --experiment ${CKPT_NAME} 16 | 17 | cd eval_tool 18 | 19 | python calculation.py --results_dir answers/${CKPT_NAME} 20 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | 5 | python -m eve.eval.model_vqa \ 6 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 7 | --question-file ./playground/data/eval/mm-vet/eve-mm-vet.jsonl \ 8 | --image-folder ./playground/data/eval/mm-vet/images \ 9 | --answers-file ./playground/data/eval/mm-vet/answers/${CKPT_NAME}.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | mkdir -p ./playground/data/eval/mm-vet/results 14 | 15 | python scripts/convert_mmvet_for_eval.py \ 16 | --src ./playground/data/eval/mm-vet/answers/${CKPT_NAME}.jsonl \ 17 | --dst ./playground/data/eval/mm-vet/results/${CKPT_NAME}.json 18 | 19 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | 5 | python -m eve.eval.model_vqa_loader \ 6 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 7 | --question-file ./playground/data/eval/pope/eve_pope_test.jsonl \ 8 | --image-folder ./playground/data/eval/pope/val2014 \ 9 | --answers-file ./playground/data/eval/pope/answers/${CKPT_NAME}.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | python eve/eval/eval_pope.py \ 14 | --annotation-dir ./playground/data/eval/pope/coco \ 15 | --question-file ./playground/data/eval/pope/eve_pope_test.jsonl \ 16 | --result-file ./playground/data/eval/pope/answers/${CKPT_NAME}.jsonl 17 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/qbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | SPLIT="dev" 5 | 6 | python -m eve.eval.model_vqa_qbench \ 7 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 8 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 
9 | --questions-file ./playground/data/eval/qbench/llvisionqa_${SPLIT}.json \ 10 | --answers-file ./playground/data/eval/qbench/llvisionqa_${SPLIT}_${CKPT_NAME}_answers.jsonl \ 11 | --conv-mode eve_v1 \ 12 | --lang en 13 | 14 | python playground/data/eval/qbench/format_qbench.py \ 15 | --filepath ./playground/data/eval/qbench/llvisionqa_${SPLIT}_${CKPT_NAME}_answers.jsonl 16 | 17 | python playground/data/eval/qbench/qbench_eval.py \ 18 | --filepath ./playground/data/eval/qbench/llvisionqa_${SPLIT}_${CKPT_NAME}_answers.jsonl -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | 5 | CHUNKS=${#GPULIST[@]} 6 | 7 | CKPT_NAME=$1 8 | CKPT_PATH=$2 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_loader \ 12 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 13 | --question-file ./playground/data/eval/seed_bench/eve-seed-bench-image.jsonl \ 14 | --image-folder ./playground/data/eval/seed_bench \ 15 | --answers-file ./playground/data/eval/seed_bench/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --temperature 0 \ 19 | --conv-mode vicuna_v1 & 20 | done 21 | 22 | wait 23 | 24 | output_file=./playground/data/eval/seed_bench/answers/${CKPT_NAME}/merge.jsonl 25 | 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ./playground/data/eval/seed_bench/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | # Evaluate 35 | python scripts/convert_seed_for_submission.py \ 36 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 37 | --result-file $output_file \ 38 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/${CKPT_NAME}.jsonl 39 | 40 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | 5 | CHUNKS=${#GPULIST[@]} 6 | 7 | CKPT_NAME=$1 8 | CKPT_PATH=$2 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_science \ 12 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 13 | --question-file ./playground/data/eval/scienceqa/eve_test_CQM-A.json \ 14 | --image-folder ./playground/data/eval/scienceqa/images/test \ 15 | --answers-file ./playground/data/eval/scienceqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl \ 16 | --num-chunks $CHUNKS \ 17 | --chunk-idx $IDX \ 18 | --single-pred-prompt \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./playground/data/eval/scienceqa/answers/${CKPT_NAME}/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 
31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/scienceqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python eve/eval/eval_science_qa.py \ 36 | --base-dir ./playground/data/eval/scienceqa \ 37 | --result-file ./playground/data/eval/scienceqa/answers/${CKPT_NAME}/merge.jsonl \ 38 | --output-file ./playground/data/eval/scienceqa/answers/${CKPT_NAME}/output.jsonl \ 39 | --output-result ./playground/data/eval/scienceqa/answers/${CKPT_NAME}/result.json 40 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | 5 | CHUNKS=${#GPULIST[@]} 6 | CKPT_NAME=$1 7 | CKPT_PATH=$2 8 | 9 | for IDX in $(seq 0 $((CHUNKS-1))); do 10 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_loader \ 11 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 12 | --question-file ./playground/data/eval/textvqa/eve_textvqa_val_v051_ocr.jsonl \ 13 | --image-folder ./playground/data/eval/textvqa/images \ 14 | --answers-file ./playground/data/eval/textvqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl \ 15 | --num-chunks $CHUNKS \ 16 | --chunk-idx $IDX \ 17 | --temperature 0 \ 18 | --conv-mode vicuna_v1 & 19 | done 20 | 21 | wait 22 | 23 | output_file=./playground/data/eval/textvqa/answers/$CKPT_NAME/merge.jsonl 24 | 25 | # Clear out the output file if it exists. 26 | > "$output_file" 27 | 28 | # Loop through the indices and concatenate each file. 29 | for IDX in $(seq 0 $((CHUNKS-1))); do 30 | cat ./playground/data/eval/textvqa/answers/$CKPT_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" 31 | done 32 | 33 | python -m eve.eval.eval_textvqa \ 34 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 35 | --result-file ./playground/data/eval/textvqa/answers/${CKPT_NAME}/merge.jsonl 36 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | 5 | python -m eve.eval.model_vqa_loader \ 6 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 7 | --question-file ./playground/data/eval/vizwiz/eve_test.jsonl \ 8 | --image-folder ./playground/data/eval/vizwiz/test \ 9 | --answers-file ./playground/data/eval/vizwiz/answers/${CKPT_NAME}.jsonl \ 10 | --temperature 0 \ 11 | --conv-mode vicuna_v1 12 | 13 | python scripts/convert_vizwiz_for_submission.py \ 14 | --annotation-file ./playground/data/eval/vizwiz/eve_test.jsonl \ 15 | --result-file ./playground/data/eval/vizwiz/answers/${CKPT_NAME}.jsonl \ 16 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/${CKPT_NAME}.json 17 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | 5 | CHUNKS=${#GPULIST[@]} 6 | 7 | CKPT_NAME=$1 8 | CKPT_PATH=$2 9 | SPLIT='eve_vqav2_mscoco_test-dev2015' 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_loader \ 13 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 14 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl 
\ 15 | --image-folder ./playground/data/eval/vqav2/test2015 \ 16 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT_NAME/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --temperature 0 \ 20 | --conv-mode vicuna_v1 & 21 | done 22 | 23 | wait 24 | 25 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT_NAME/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT_NAME 36 | 37 | -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eve7b_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_SOCKET_IFNAME=eth0 3 | export NCCL_IB_DISABLE=0 4 | export NCCL_IB_CUDA_SUPPORT=1 5 | export NCCL_IB_GID_INDEX=0 6 | export NCCL_DEBUG=INFO 7 | export NCCL_IB_TIMEOUT=23 8 | export NCCL_IB_RETRY_CNT=7 9 | export NCCL_IB_HCA=mlx5_2,mlx5_5 10 | 11 | set -x 12 | 13 | wandb login 14 | apt-get install -y libibverbs1 15 | mkdir -p logs 16 | 17 | export CUDA_DEVICE_MAX_CONNECTIONS=1 18 | export GPUS_PER_NODE=8 19 | export NNODES=4 20 | export MASTER_PORT=23456 21 | export CPUS_PER_TASK=32 22 | export QUOTA=reserved 23 | 24 | export DATA_PATH=playground/data/EVE-Finetune/llava_v1_5_mix665k.json 25 | export IMAGE_PATH=playground/data/EVE-Finetune 26 | export VIT_PATH=openai/eve-patch14-anypixel-672 27 | export VIT_PATH_CLIP=openai/clip-vit-large-patch14-336 28 | export BASE_LR=2e-5 29 | export LEARNIG_RATE=2e-5 30 | 31 | export CKPT_PATH=checkpoints/eve-7b-prtr1-672-mse (or EVE-7B-Pretrain-v1.0) 32 | export SAVE_PATH=eve-7b-fitu-672-mse (or EVE-7B-v1.0) 33 | 34 | 35 | torchrun --nproc_per_node=$GPUS_PER_NODE --nnode=$NNODES --node_rank=$1 --master_addr=$2 --master_port=$MASTER_PORT \ 36 | eve/train/train_mem.py \ 37 | --model_name_or_path ${CKPT_PATH} \ 38 | --deepspeed ./scripts/zero3.json \ 39 | --version v1 \ 40 | --data_path ${DATA_PATH} \ 41 | --image_folder ${IMAGE_PATH} \ 42 | --vision_tower ${VIT_PATH} \ 43 | --vision_tower_clip ${VIT_PATH_CLIP} \ 44 | --requires_cliploss True \ 45 | --mm_projector_type mlp2x_gelu \ 46 | --mm_vision_select_layer -2 \ 47 | --mm_use_im_start_end False \ 48 | --mm_use_im_patch_token False \ 49 | --group_by_modality_length True \ 50 | --bf16 True \ 51 | --output_dir checkpoints/${SAVE_PATH} \ 52 | --num_train_epochs 1 \ 53 | --per_device_train_batch_size 4 \ 54 | --per_device_eval_batch_size 4 \ 55 | --gradient_accumulation_steps 1 \ 56 | --evaluation_strategy "no" \ 57 | --save_strategy "steps" \ 58 | --save_steps 50000 \ 59 | --save_total_limit 1 \ 60 | --learning_rate ${BASE_LR} \ 61 | --mm_projector_lr ${LEARNIG_RATE} \ 62 | --vision_tower_lr ${LEARNIG_RATE} \ 63 | --weight_decay 0. 
\ 64 | --warmup_ratio 0.01 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --tf32 True \ 68 | --model_max_length 2048 \ 69 | --gradient_checkpointing True \ 70 | --dataloader_num_workers 4 \ 71 | --lazy_preprocess True \ 72 | --report_to wandb \ 73 | --run_name ${SAVE_PATH} \ 74 | 2>&1 | tee logs/${SAVE_PATH}-rank$1-$(date "+%Y-%m-%d|%H:%M:%S").log -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eve7b_finetune_hd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_SOCKET_IFNAME=eth0 3 | export NCCL_IB_DISABLE=0 4 | export NCCL_IB_CUDA_SUPPORT=1 5 | export NCCL_IB_GID_INDEX=0 6 | export NCCL_DEBUG=INFO 7 | export NCCL_IB_TIMEOUT=23 8 | export NCCL_IB_RETRY_CNT=7 9 | export NCCL_IB_HCA=mlx5_2,mlx5_5 10 | 11 | set -x 12 | 13 | wandb login 14 | apt-get install -y libibverbs1 15 | mkdir -p logs 16 | 17 | export CUDA_DEVICE_MAX_CONNECTIONS=1 18 | export GPUS_PER_NODE=8 19 | export NNODES=4 20 | export MASTER_PORT=23456 21 | export CPUS_PER_TASK=32 22 | export QUOTA=reserved 23 | 24 | export DATA_PATH=playground/data/EVE-Finetune/eve_instruct_mix1.8m.json 25 | export IMAGE_PATH=playground/data/EVE-Finetune 26 | export VIT_PATH=openai/eve-patch14-anypixel-1344 27 | export VIT_PATH_CLIP=openai/clip-vit-large-patch14-336 28 | export BASE_LR=2e-5 29 | export LEARNIG_RATE=2e-5 30 | 31 | export CKPT_PATH=checkpoints/eve-7b-prtr1-672-mse (or EVE-7B-Pretrain-v1.0) 32 | export SAVE_PATH=eve-7b-fitu-1344-mse (or EVE-7B-HD-v1.0) 33 | 34 | 35 | torchrun --nproc_per_node=$GPUS_PER_NODE --nnode=$NNODES --node_rank=$1 --master_addr=$2 --master_port=$MASTER_PORT \ 36 | eve/train/train_mem.py \ 37 | --model_name_or_path ${CKPT_PATH} \ 38 | --deepspeed ./scripts/zero3.json \ 39 | --version v1 \ 40 | --data_path ${DATA_PATH} \ 41 | --image_folder ${IMAGE_PATH} \ 42 | --vision_tower ${VIT_PATH} \ 43 | --vision_tower_clip ${VIT_PATH_CLIP} \ 44 | --requires_cliploss True \ 45 | --mm_projector_type mlp2x_gelu \ 46 | --mm_vision_select_layer -2 \ 47 | --mm_use_im_start_end False \ 48 | --mm_use_im_patch_token False \ 49 | --group_by_modality_length True \ 50 | --bf16 True \ 51 | --output_dir checkpoints/${SAVE_PATH} \ 52 | --num_train_epochs 1 \ 53 | --per_device_train_batch_size 2 \ 54 | --per_device_eval_batch_size 4 \ 55 | --gradient_accumulation_steps 2 \ 56 | --evaluation_strategy "no" \ 57 | --save_strategy "steps" \ 58 | --save_steps 50000 \ 59 | --save_total_limit 1 \ 60 | --learning_rate ${BASE_LR} \ 61 | --mm_projector_lr ${LEARNIG_RATE} \ 62 | --vision_tower_lr ${LEARNIG_RATE} \ 63 | --weight_decay 0. 
\ 64 | --warmup_ratio 0.01 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --tf32 True \ 68 | --model_max_length 4096 \ 69 | --gradient_checkpointing True \ 70 | --dataloader_num_workers 4 \ 71 | --lazy_preprocess True \ 72 | --report_to wandb \ 73 | --run_name ${SAVE_PATH} \ 74 | 2>&1 | tee logs/${SAVE_PATH}-rank$1-$(date "+%Y-%m-%d|%H:%M:%S").log -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eve7b_prealign.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_SOCKET_IFNAME=eth0 3 | export NCCL_IB_DISABLE=0 4 | export NCCL_IB_CUDA_SUPPORT=1 5 | export NCCL_IB_GID_INDEX=0 6 | export NCCL_DEBUG=INFO 7 | export NCCL_IB_TIMEOUT=23 8 | export NCCL_IB_RETRY_CNT=7 9 | export NCCL_IB_HCA=mlx5_2,mlx5_5 10 | 11 | set -x 12 | 13 | wandb login 14 | apt-get install -y libibverbs1 15 | mkdir -p logs 16 | 17 | export CUDA_DEVICE_MAX_CONNECTIONS=1 18 | export GPUS_PER_NODE=8 19 | export NNODES=4 20 | export MASTER_PORT=23456 21 | export CPUS_PER_TASK=32 22 | export QUOTA=reserved 23 | 24 | # We have no specific plan to release pretraining data. 25 | # However, you can download and filter images according to our paper's guidelines. 26 | # Then, use LLaVA-NEXT to generate high-definition image descriptions, which would provide better results. 27 | 28 | export DATA_PATH=playground/data/EVE-Pretrain-33M/eve_pretrain_cap33m.json 29 | export IMAGE_PATH=playground/data/EVE-Pretrain-33M 30 | export VIT_PATH=openai/eve-patch14-anypixel-672 31 | export VIT_PATH_CLIP=openai/clip-vit-large-patch14-336 32 | export BASE_LR=4e-5 33 | export LEARNIG_RATE=4e-4 34 | 35 | export CKPT_PATH=lmsys/vicuna-7b-v1.5 36 | export SAVE_PATH=eve-7b-prtr0-672-mse (or EVE-7B-Prealign-v1.0) 37 | 38 | 39 | torchrun --nproc_per_node=$GPUS_PER_NODE --nnode=$NNODES --node_rank=$1 --master_addr=$2 --master_port=$MASTER_PORT \ 40 | eve/train/train_mem.py \ 41 | --model_name_or_path ${CKPT_PATH} \ 42 | --deepspeed ./scripts/zero3.json \ 43 | --version plain \ 44 | --data_path ${DATA_PATH} \ 45 | --image_folder ${IMAGE_PATH} \ 46 | --vision_tower ${VIT_PATH} \ 47 | --vision_tower_clip ${VIT_PATH_CLIP} \ 48 | --requires_cliploss True \ 49 | --tune_vision_tower True \ 50 | --mm_projector_type mlp2x_gelu \ 51 | --mm_vision_select_layer -2 \ 52 | --mm_use_im_start_end False \ 53 | --mm_use_im_patch_token False \ 54 | --bf16 True \ 55 | --output_dir checkpoints/${SAVE_PATH} \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size 16 \ 58 | --per_device_eval_batch_size 4 \ 59 | --gradient_accumulation_steps 1 \ 60 | --evaluation_strategy "no" \ 61 | --save_strategy "steps" \ 62 | --save_steps 7813 \ 63 | --save_total_limit 10 \ 64 | --learning_rate ${BASE_LR} \ 65 | --mm_projector_lr ${LEARNIG_RATE} \ 66 | --vision_tower_lr ${LEARNIG_RATE} \ 67 | --weight_decay 0. 
\ 68 | --warmup_ratio 0.03 \ 69 | --lr_scheduler_type "cosine" \ 70 | --logging_steps 1 \ 71 | --tf32 True \ 72 | --model_max_length 2048 \ 73 | --gradient_checkpointing True \ 74 | --dataloader_num_workers 4 \ 75 | --lazy_preprocess True \ 76 | --report_to wandb \ 77 | --run_name ${SAVE_PATH} \ 78 | 2>&1 | tee logs/${SAVE_PATH}-rank$1-$(date "+%Y-%m-%d|%H:%M:%S").log -------------------------------------------------------------------------------- /EVEv1/scripts/eve/eve7b_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_SOCKET_IFNAME=eth0 3 | export NCCL_IB_DISABLE=0 4 | export NCCL_IB_CUDA_SUPPORT=1 5 | export NCCL_IB_GID_INDEX=0 6 | export NCCL_DEBUG=INFO 7 | export NCCL_IB_TIMEOUT=23 8 | export NCCL_IB_RETRY_CNT=7 9 | export NCCL_IB_HCA=mlx5_2,mlx5_5 10 | 11 | set -x 12 | 13 | wandb login 14 | apt-get install -y libibverbs1 15 | mkdir -p logs 16 | 17 | export CUDA_DEVICE_MAX_CONNECTIONS=1 18 | export GPUS_PER_NODE=8 19 | export NNODES=4 20 | export MASTER_PORT=23456 21 | export CPUS_PER_TASK=32 22 | export QUOTA=reserved 23 | 24 | # We have no specific plan to release pretraining data. 25 | # However, you can download and filter images according to our paper's guidelines. 26 | # Then, use LLaVA-NEXT to generate high-definition image descriptions, which would provide better results. 27 | 28 | export DATA_PATH=playground/data/EVE-Pretrain-33M/eve_pretrain_cap33m.json 29 | export IMAGE_PATH=playground/data/EVE-Pretrain-33M 30 | export VIT_PATH=openai/eve-patch14-anypixel-672 31 | export VIT_PATH_CLIP=openai/clip-vit-large-patch14-336 32 | export BASE_LR=4e-5 33 | 34 | export CKPT_PATH=checkpoints/eve-7b-prtr0-672-mse (or EVE-7B-Prealign-v1.0) 35 | export SAVE_PATH=eve-7b-prtr1-672-mse (or EVE-7B-Pretrain-v1.0) 36 | 37 | 38 | torchrun --nproc_per_node=$GPUS_PER_NODE --nnode=$NNODES --node_rank=$1 --master_addr=$2 --master_port=$MASTER_PORT \ 39 | eve/train/train_mem.py \ 40 | --model_name_or_path ${CKPT_PATH} \ 41 | --deepspeed ./scripts/zero3.json \ 42 | --version plain \ 43 | --data_path ${DATA_PATH} \ 44 | --image_folder ${IMAGE_PATH} \ 45 | --vision_tower ${VIT_PATH} \ 46 | --vision_tower_clip ${VIT_PATH_CLIP} \ 47 | --requires_cliploss True \ 48 | --mm_projector_type mlp2x_gelu \ 49 | --mm_vision_select_layer -2 \ 50 | --mm_use_im_start_end False \ 51 | --mm_use_im_patch_token False \ 52 | --bf16 True \ 53 | --output_dir checkpoints/${SAVE_PATH} \ 54 | --num_train_epochs 1 \ 55 | --per_device_train_batch_size 16 \ 56 | --per_device_eval_batch_size 4 \ 57 | --gradient_accumulation_steps 1 \ 58 | --evaluation_strategy "no" \ 59 | --save_strategy "steps" \ 60 | --save_steps 7813 \ 61 | --save_total_limit 10 \ 62 | --learning_rate ${BASE_LR} \ 63 | --weight_decay 0. 
\ 64 | --warmup_ratio 0.01 \ 65 | --lr_scheduler_type "cosine" \ 66 | --logging_steps 1 \ 67 | --tf32 True \ 68 | --model_max_length 2048 \ 69 | --gradient_checkpointing True \ 70 | --dataloader_num_workers 4 \ 71 | --lazy_preprocess True \ 72 | --report_to wandb \ 73 | --run_name ${SAVE_PATH} \ 74 | 2>&1 | tee logs/${SAVE_PATH}-rank$1-$(date "+%Y-%m-%d|%H:%M:%S").log -------------------------------------------------------------------------------- /EVEv1/scripts/eve/test_all_benchmark.sh: -------------------------------------------------------------------------------- 1 | CKPT_NAME='EVE-7B-HD-v1.0' 2 | CKPT_PATH='BAAI' 3 | mkdir -p log_results 4 | 5 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/seed.sh ${CKPT_NAME} ${CKPT_PATH} 2>&1 | tee log_results/${CKPT_NAME}_seed 6 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/gqa.sh ${CKPT_NAME} ${CKPT_PATH} 2>&1 | tee log_results/${CKPT_NAME}_gqa 7 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/sqa.sh ${CKPT_NAME} ${CKPT_PATH} 2>&1 | tee log_results/${CKPT_NAME}_sqa 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/vqav2.sh ${CKPT_NAME} ${CKPT_PATH} 2>&1 | tee log_results/${CKPT_NAME}_vqav2 9 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/textvqa.sh ${CKPT_NAME} ${CKPT_PATH} 2>&1 | tee log_results/${CKPT_NAME}_textvqa 10 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/mmbench_en.sh ${CKPT_NAME} ${CKPT_PATH} 2>&1 | tee log_results/${CKPT_NAME}_mmbench_en 11 | CUDA_VISIBLE_DEVICES=0 bash scripts/eve/eval/mme.sh ${CKPT_NAME} ${CKPT_PATH} 2>&1 | tee log_results/${CKPT_NAME}_mme 12 | CUDA_VISIBLE_DEVICES=1 bash scripts/eve/eval/mmvet.sh ${CKPT_NAME} ${CKPT_PATH} 2>&1 | tee log_results/${CKPT_NAME}_mmvet 13 | CUDA_VISIBLE_DEVICES=2 bash scripts/eve/eval/pope.sh ${CKPT_NAME} ${CKPT_PATH} 2>&1 | tee log_results/${CKPT_NAME}_pope 14 | CUDA_VISIBLE_DEVICES=3 bash scripts/eve/eval/vizwiz.sh ${CKPT_NAME} ${CKPT_PATH} 2>&1 | tee log_results/${CKPT_NAME}_vizwiz 15 | -------------------------------------------------------------------------------- /EVEv1/scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /EVEv1/scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | 
"stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /EVEv1/scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /EVEv2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/.DS_Store -------------------------------------------------------------------------------- /EVEv2/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 BAAI-Vision 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /EVEv2/docs/Data.md: -------------------------------------------------------------------------------- 1 | ## Data Composition 2 | 3 | The implementational code, model structure, and weights of our caption engine DenseFusion++ will be updated soon! 4 | 5 | Related Project for Acknowledgments: [DenseFusion](https://github.com/baaivision/DenseFusion) 6 | -------------------------------------------------------------------------------- /EVEv2/eve/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/eve/.DS_Store -------------------------------------------------------------------------------- /EVEv2/eve/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import EVELlamaForCausalLM 2 | -------------------------------------------------------------------------------- /EVEv2/eve/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /EVEv2/eve/eval/eval_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import time 5 | 6 | import openai 7 | import ray 8 | import tqdm 9 | 10 | NUM_SECONDS_TO_SLEEP = 3 11 | 12 | 13 | @ray.remote(num_cpus=4) 14 | def get_eval(content: str, max_tokens: int): 15 | while True: 16 | try: 17 | response = openai.ChatCompletion.create( 18 | model='gpt-4', 19 | messages=[{ 20 | 'role': 'system', 21 | 'content': 'You are a helpful and precise assistant for checking the quality of the answer.' 
22 | }, { 23 | 'role': 'user', 24 | 'content': content, 25 | }], 26 | temperature=0.2, # TODO: figure out which temperature is best for evaluation 27 | max_tokens=max_tokens, 28 | ) 29 | break 30 | except openai.error.RateLimitError: 31 | pass 32 | except Exception as e: 33 | print(e) 34 | time.sleep(NUM_SECONDS_TO_SLEEP) 35 | 36 | print('success!') 37 | return response['choices'][0]['message']['content'] 38 | 39 | 40 | def parse_score(review): 41 | try: 42 | score_pair = review.split('\n')[0] 43 | score_pair = score_pair.replace(',', ' ') 44 | sp = score_pair.split(' ') 45 | if len(sp) == 2: 46 | return [float(sp[0]), float(sp[1])] 47 | else: 48 | print('error', review) 49 | return [-1, -1] 50 | except Exception as e: 51 | print(e) 52 | print('error', review) 53 | return [-1, -1] 54 | 55 | 56 | if __name__ == '__main__': 57 | parser = argparse.ArgumentParser( 58 | description='ChatGPT-based QA evaluation.') 59 | parser.add_argument('-q', '--question') 60 | # parser.add_argument('-a', '--answer') 61 | parser.add_argument('-a', '--answer-list', nargs='+', default=[]) 62 | parser.add_argument('-r', '--rule') 63 | parser.add_argument('-o', '--output') 64 | parser.add_argument('--max-tokens', type=int, default=1024, 65 | help='maximum number of tokens produced in the output') 66 | args = parser.parse_args() 67 | 68 | ray.init() 69 | 70 | f_q = open(os.path.expanduser(args.question)) 71 | f_ans1 = open(os.path.expanduser(args.answer_list[0])) 72 | f_ans2 = open(os.path.expanduser(args.answer_list[1])) 73 | rule_dict = json.load(open(os.path.expanduser(args.rule), 'r')) 74 | 75 | review_file = open(f'{args.output}', 'w') 76 | 77 | js_list = [] 78 | handles = [] 79 | idx = 0 80 | for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2): 81 | # if idx == 1: 82 | # break 83 | 84 | ques = json.loads(ques_js) 85 | ans1 = json.loads(ans1_js) 86 | ans2 = json.loads(ans2_js) 87 | 88 | category = json.loads(ques_js)['category'] 89 | if category in rule_dict: 90 | rule = rule_dict[category] 91 | else: 92 | rule = rule_dict['default'] 93 | prompt = rule['prompt'] 94 | role = rule['role'] 95 | content = (f'[Question]\n{ques["text"]}\n\n' 96 | f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n' 97 | f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n' 98 | f'[System]\n{prompt}\n\n') 99 | js_list.append({ 100 | 'id': idx+1, 101 | 'question_id': ques['question_id'], 102 | 'answer1_id': ans1['answer_id'], 103 | 'answer2_id': ans2['answer_id'], 104 | 'category': category}) 105 | idx += 1 106 | handles.append(get_eval.remote(content, args.max_tokens)) 107 | # To avoid the rate limit set by OpenAI 108 | time.sleep(NUM_SECONDS_TO_SLEEP) 109 | 110 | reviews = ray.get(handles) 111 | for idx, review in enumerate(reviews): 112 | scores = parse_score(review) 113 | js_list[idx]['content'] = review 114 | js_list[idx]['tuple'] = scores 115 | review_file.write(json.dumps(js_list[idx]) + '\n') 116 | review_file.close() 117 | -------------------------------------------------------------------------------- /EVEv2/eve/eval/eval_one_sample.py: -------------------------------------------------------------------------------- 1 | from eve.eval.run_eve import eval_model 2 | 3 | model_path = "Absolute Path of BAAI/EVE-7B-HD-v2.0" 4 | model_type = 'qwen2' 5 | conv_mode = 'qwen2' 6 | 7 | prompt = "Please describe the image in detail." 
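# Note: the settings above and the prompt/image below are bundled into the ad-hoc args
# object handed to eval_model; as the placeholder indicates, model_path should be the
# absolute local path of the downloaded checkpoint, and model_type/conv_mode should
# match the checkpoint family ('qwen2' here; an eve_llama3 variant also ships under
# eve/model/language_model).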
8 | image_file = "examples/ocr_beijing.jpg" 9 | 10 | args = type('Args', (), { 11 | "model_path": model_path, 12 | "model_type": model_type, 13 | "query": prompt, 14 | "conv_mode": conv_mode, 15 | "image_file": image_file, 16 | "temperature": 0.2, 17 | "do_sample": True, 18 | "top_p": None, 19 | "top_k": None, 20 | "num_beams": 1, 21 | "max_new_tokens": 512 22 | })() 23 | 24 | eval_model(args) -------------------------------------------------------------------------------- /EVEv2/eve/eval/eval_pope.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | 6 | def eval_pope(answers, label_file): 7 | label_list = [json.loads(q)['label'] for q in open(label_file, 'r')] 8 | 9 | for answer in answers: 10 | text = answer['text'] 11 | 12 | # Only keep the first sentence 13 | if text.find('.') != -1: 14 | text = text.split('.')[0] 15 | 16 | text = text.replace(',', '') 17 | words = text.split(' ') 18 | if 'No' in words or 'not' in words or 'no' in words: 19 | answer['text'] = 'no' 20 | else: 21 | answer['text'] = 'yes' 22 | 23 | for i in range(len(label_list)): 24 | if label_list[i] == 'no': 25 | label_list[i] = 0 26 | else: 27 | label_list[i] = 1 28 | 29 | pred_list = [] 30 | for answer in answers: 31 | if answer['text'] == 'no': 32 | pred_list.append(0) 33 | else: 34 | pred_list.append(1) 35 | 36 | pos = 1 37 | neg = 0 38 | yes_ratio = pred_list.count(1) / len(pred_list) 39 | 40 | TP, TN, FP, FN = 0, 0, 0, 0 41 | for pred, label in zip(pred_list, label_list): 42 | if pred == pos and label == pos: 43 | TP += 1 44 | elif pred == pos and label == neg: 45 | FP += 1 46 | elif pred == neg and label == neg: 47 | TN += 1 48 | elif pred == neg and label == pos: 49 | FN += 1 50 | 51 | print('TP\tFP\tTN\tFN\t') 52 | print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN)) 53 | 54 | precision = float(TP) / float(TP + FP) 55 | recall = float(TP) / float(TP + FN) 56 | f1 = 2*precision*recall / (precision + recall) 57 | acc = (TP + TN) / (TP + TN + FP + FN) 58 | print('Accuracy: {}'.format(acc)) 59 | print('Precision: {}'.format(precision)) 60 | print('Recall: {}'.format(recall)) 61 | print('F1 score: {}'.format(f1)) 62 | print('Yes ratio: {}'.format(yes_ratio)) 63 | print('%.3f, %.3f, %.3f, %.3f, %.3f' % 64 | (f1, acc, precision, recall, yes_ratio)) 65 | 66 | 67 | if __name__ == "__main__": 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument("--annotation-dir", type=str) 70 | parser.add_argument("--question-file", type=str) 71 | parser.add_argument("--result-file", type=str) 72 | args = parser.parse_args() 73 | 74 | questions = [json.loads(line) for line in open(args.question_file)] 75 | questions = {question['question_id']: question for question in questions} 76 | answers = [json.loads(q) for q in open(args.result_file)] 77 | for file in os.listdir(args.annotation_dir): 78 | assert file.startswith('coco_pope_') 79 | assert file.endswith('.json') 80 | category = file[10:-5] 81 | cur_answers = [ 82 | x for x in answers if questions[x['question_id']]['category'] == category] 83 | print('Category: {}, # samples: {}'.format(category, len(cur_answers))) 84 | eval_pope(cur_answers, os.path.join(args.annotation_dir, file)) 85 | print("====================================") 86 | -------------------------------------------------------------------------------- /EVEv2/eve/eval/eval_science_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 
import re 5 | import random 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--base-dir', type=str) 11 | parser.add_argument('--result-file', type=str) 12 | parser.add_argument('--output-file', type=str) 13 | parser.add_argument('--output-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return -1 36 | return random.choice(range(len(choices))) 37 | 38 | 39 | if __name__ == "__main__": 40 | args = get_args() 41 | 42 | base_dir = args.base_dir 43 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 44 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 45 | predictions = [json.loads(line) for line in open(args.result_file)] 46 | predictions = {pred['question_id']: pred for pred in predictions} 47 | split_problems = {idx: problems[idx] for idx in split_indices} 48 | 49 | results = {'correct': [], 'incorrect': []} 50 | sqa_results = {} 51 | sqa_results['acc'] = None 52 | sqa_results['correct'] = None 53 | sqa_results['count'] = None 54 | sqa_results['results'] = {} 55 | sqa_results['outputs'] = {} 56 | 57 | for prob_id, prob in split_problems.items(): 58 | if prob_id not in predictions: 59 | pred = {'text': 'FAILED', 'prompt': 'Unknown'} 60 | pred_text = 'FAILED' 61 | else: 62 | pred = predictions[prob_id] 63 | pred_text = pred['text'] 64 | 65 | if pred_text in args.options: 66 | answer = pred_text 67 | elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ": 68 | answer = pred_text[0] 69 | else: 70 | pattern = re.compile(r'The answer is ([A-Z]).') 71 | res = pattern.findall(pred_text) 72 | if len(res) == 1: 73 | answer = res[0] # 'A', 'B', ... 
74 | else: 75 | answer = "FAILED" 76 | 77 | pred_idx = get_pred_idx(answer, prob['choices'], args.options) 78 | 79 | analysis = { 80 | 'question_id': prob_id, 81 | 'parsed_ans': answer, 82 | 'ground_truth': args.options[prob['answer']], 83 | 'question': pred['prompt'], 84 | 'pred': pred_text, 85 | 'is_multimodal': '<image>' in pred['prompt'], 86 | } 87 | 88 | sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options) 89 | sqa_results['outputs'][prob_id] = pred_text 90 | 91 | if pred_idx == prob['answer']: 92 | results['correct'].append(analysis) 93 | else: 94 | results['incorrect'].append(analysis) 95 | 96 | correct = len(results['correct']) 97 | total = len(results['correct']) + len(results['incorrect']) 98 | 99 | ###### IMG ###### 100 | multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']]) 101 | multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']]) 102 | multimodal_total = multimodal_correct + multimodal_incorrect 103 | ###### IMG ###### 104 | 105 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%, IMG-Accuracy: {multimodal_correct / multimodal_total * 100:.2f}%') 106 | 107 | sqa_results['acc'] = correct / total * 100 108 | sqa_results['correct'] = correct 109 | sqa_results['count'] = total 110 | 111 | with open(args.output_file, 'w') as f: 112 | json.dump(results, f, indent=2) 113 | with open(args.output_result, 'w') as f: 114 | json.dump(sqa_results, f, indent=2) 115 | -------------------------------------------------------------------------------- /EVEv2/eve/eval/eval_science_qa_gpt4.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import re 5 | import random 6 | from collections import defaultdict 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--base-dir', type=str) 12 | parser.add_argument('--gpt4-result', type=str) 13 | parser.add_argument('--our-result', type=str) 14 | parser.add_argument('--split', type=str, default='test') 15 | parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"]) 16 | return parser.parse_args() 17 | 18 | 19 | def convert_caps(results): 20 | fakecaps = [] 21 | for result in results: 22 | image_id = result['question_id'] 23 | caption = result['text'] 24 | fakecaps.append({"image_id": int(image_id), "caption": caption}) 25 | return fakecaps 26 | 27 | 28 | def get_pred_idx(prediction, choices, options): 29 | """ 30 | Get the index (e.g. 2) from the prediction (e.g. 
'C') 31 | """ 32 | if prediction in options[:len(choices)]: 33 | return options.index(prediction) 34 | else: 35 | return random.choice(range(len(choices))) 36 | 37 | 38 | if __name__ == "__main__": 39 | args = get_args() 40 | 41 | base_dir = args.base_dir 42 | split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split] 43 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 44 | our_predictions = [json.loads(line) for line in open(args.our_result)] 45 | our_predictions = {pred['question_id']: pred for pred in our_predictions} 46 | split_problems = {idx: problems[idx] for idx in split_indices} 47 | 48 | gpt4_predictions = json.load(open(args.gpt4_result))['outputs'] 49 | 50 | results = defaultdict(lambda: 0) 51 | 52 | for prob_id, prob in split_problems.items(): 53 | if prob_id not in our_predictions: 54 | continue 55 | if prob_id not in gpt4_predictions: 56 | continue 57 | our_pred = our_predictions[prob_id]['text'] 58 | gpt4_pred = gpt4_predictions[prob_id] 59 | 60 | pattern = re.compile(r'The answer is ([A-Z]).') 61 | our_res = pattern.findall(our_pred) 62 | if len(our_res) == 1: 63 | our_answer = our_res[0] # 'A', 'B', ... 64 | else: 65 | our_answer = "FAILED" 66 | gpt4_res = pattern.findall(gpt4_pred) 67 | if len(gpt4_res) == 1: 68 | gpt4_answer = gpt4_res[0] # 'A', 'B', ... 69 | else: 70 | gpt4_answer = "FAILED" 71 | 72 | our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options) 73 | gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options) 74 | 75 | if gpt4_answer == 'FAILED': 76 | results['gpt4_failed'] += 1 77 | # continue 78 | gpt4_pred_idx = our_pred_idx 79 | # if our_pred_idx != prob['answer']: 80 | # print(our_predictions[prob_id]['prompt']) 81 | # print('-----------------') 82 | # print(f'LECTURE: {prob["lecture"]}') 83 | # print(f'SOLUTION: {prob["solution"]}') 84 | # print('=====================') 85 | else: 86 | # continue 87 | pass 88 | # gpt4_pred_idx = our_pred_idx 89 | 90 | if gpt4_pred_idx == prob['answer']: 91 | results['correct'] += 1 92 | else: 93 | results['incorrect'] += 1 94 | 95 | 96 | if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']: 97 | results['correct_upperbound'] += 1 98 | 99 | correct = results['correct'] 100 | total = results['correct'] + results['incorrect'] 101 | print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%') 102 | print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%') 103 | print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%') 104 | 105 | -------------------------------------------------------------------------------- /EVEv2/eve/eval/eval_textvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import re 5 | 6 | from eve.eval.m4c_evaluator import TextVQAAccuracyEvaluator 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--annotation-file', type=str) 12 | parser.add_argument('--result-file', type=str) 13 | parser.add_argument('--result-dir', type=str) 14 | return parser.parse_args() 15 | 16 | 17 | def prompt_processor(prompt): 18 | if prompt.startswith('OCR tokens: '): 19 | pattern = r"Question: (.*?) 
Short answer:" 20 | match = re.search(pattern, prompt, re.DOTALL) 21 | question = match.group(1) 22 | elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: 23 | if prompt.startswith('Reference OCR token:'): 24 | question = prompt.split('\n')[1] 25 | else: 26 | question = prompt.split('\n')[0] 27 | elif len(prompt.split('\n')) == 2: 28 | question = prompt.split('\n')[0] 29 | else: 30 | assert False 31 | 32 | return question.lower() 33 | 34 | 35 | def eval_single(annotation_file, result_file): 36 | experiment_name = os.path.splitext(os.path.basename(result_file))[0] 37 | print(experiment_name) 38 | annotations = json.load(open(annotation_file))['data'] 39 | annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations} 40 | results = [json.loads(line) for line in open(result_file)] 41 | 42 | pred_list = [] 43 | for result in results: 44 | annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] 45 | pred_list.append({ 46 | "pred_answer": result['text'], 47 | "gt_answers": annotation['answers'], 48 | }) 49 | 50 | evaluator = TextVQAAccuracyEvaluator() 51 | print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list))) 52 | 53 | 54 | if __name__ == "__main__": 55 | args = get_args() 56 | 57 | if args.result_file is not None: 58 | eval_single(args.annotation_file, args.result_file) 59 | 60 | if args.result_dir is not None: 61 | for result_file in sorted(os.listdir(args.result_dir)): 62 | if not result_file.endswith('.jsonl'): 63 | print(f'Skipping {result_file}') 64 | continue 65 | eval_single(args.annotation_file, os.path.join(args.result_dir, result_file)) 66 | -------------------------------------------------------------------------------- /EVEv2/eve/eval/model_qa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import shortuuid 6 | import torch 7 | from tqdm import tqdm 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria 9 | 10 | from eve.conversation import conv_templates 11 | from eve.utils import disable_torch_init 12 | from eve.mm_utils import get_model_name_from_path 13 | 14 | 15 | # new stopping implementation 16 | class KeywordsStoppingCriteria(StoppingCriteria): 17 | def __init__(self, keywords, tokenizer, input_ids): 18 | self.keywords = keywords 19 | self.tokenizer = tokenizer 20 | self.start_len = None 21 | self.input_ids = input_ids 22 | 23 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 24 | if self.start_len is None: 25 | self.start_len = self.input_ids.shape[1] 26 | else: 27 | outputs = self.tokenizer.batch_decode( 28 | output_ids[:, self.start_len:], skip_special_tokens=True)[0] 29 | for keyword in self.keywords: 30 | if keyword in outputs: 31 | return True 32 | return False 33 | 34 | 35 | @torch.inference_mode() 36 | def eval_model(model_path, questions_file, answers_file): 37 | # Model 38 | disable_torch_init() 39 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 40 | model = AutoModelForCausalLM.from_pretrained(model_path, 41 | torch_dtype=torch.float16).cuda() 42 | 43 | ques_file = open(os.path.expanduser(questions_file), "r") 44 | ans_file = open(os.path.expanduser(answers_file), "w") 45 | for i, line in enumerate(tqdm(ques_file)): 46 | idx = json.loads(line)["question_id"] 47 | qs = json.loads(line)["text"] 48 | cat = 
json.loads(line)["category"] 49 | conv = conv_templates[args.conv_mode].copy() 50 | conv.append_message(conv.roles[0], qs) 51 | prompt = conv.get_prompt() 52 | inputs = tokenizer([prompt]) 53 | input_ids = torch.as_tensor(inputs.input_ids).cuda() 54 | stopping_criteria = KeywordsStoppingCriteria( 55 | [conv.sep], tokenizer, input_ids) 56 | output_ids = model.generate( 57 | input_ids, 58 | do_sample=args.do_sample, 59 | top_p=args.top_p, 60 | top_k=args.top_k, 61 | num_beams=args.num_beams, 62 | temperature=args.temperature, 63 | use_cache=True, 64 | max_new_tokens=args.max_new_tokens, 65 | stopping_criteria=[stopping_criteria]) 66 | outputs = tokenizer.batch_decode( 67 | output_ids, skip_special_tokens=True)[0] 68 | try: 69 | index = outputs.index(conv.sep, len(prompt)) 70 | except ValueError: 71 | outputs += conv.sep 72 | index = outputs.index(conv.sep, len(prompt)) 73 | 74 | outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip() 75 | ans_id = shortuuid.uuid() 76 | ans_file.write(json.dumps({"question_id": idx, 77 | "text": outputs, 78 | "answer_id": ans_id, 79 | "model_id": get_model_name_from_path(model_path), 80 | "metadata": {}}) + "\n") 81 | ans_file.flush() 82 | ans_file.close() 83 | 84 | 85 | if __name__ == "__main__": 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument("--model-path", type=str, default=None) 88 | parser.add_argument("--conv-mode", type=str, default=None) 89 | parser.add_argument("--question-file", type=str, default=None) 90 | parser.add_argument("--answers-file", type=str, default=None) 91 | parser.add_argument("--temperature", type=float, default=None) 92 | parser.add_argument("--top_p", type=float, default=None) 93 | parser.add_argument("--top_k", type=float, default=None) 94 | parser.add_argument("--num_beams", type=int, default=1) 95 | parser.add_argument("--do_sample", type=bool, default=False) 96 | parser.add_argument("--max_new_tokens", type=int, default=1024) 97 | args = parser.parse_args() 98 | exit() 99 | eval_model(args.model_path, args.question_file, args.answers_file) 100 | -------------------------------------------------------------------------------- /EVEv2/eve/eval/qa_baseline_gpt35.py: -------------------------------------------------------------------------------- 1 | """Generate answers with GPT-3.5""" 2 | # Note: you need to be using OpenAI Python v0.27.0 for the code below to work 3 | import argparse 4 | import concurrent.futures 5 | import json 6 | import os 7 | import time 8 | 9 | import openai 10 | import shortuuid 11 | import tqdm 12 | 13 | openai.api_key = 'xxx' # replace with your own key 14 | MODEL = 'gpt-3.5-turbo' 15 | MODEL_ID = 'gpt-3.5-turbo:20230327' 16 | 17 | 18 | def get_answer(question_id: int, question: str, max_tokens: int): 19 | ans = { 20 | 'answer_id': shortuuid.uuid(), 21 | 'question_id': question_id, 22 | 'model_id': MODEL_ID, 23 | } 24 | for _ in range(3): 25 | try: 26 | response = openai.ChatCompletion.create( 27 | model=MODEL, 28 | messages=[{ 29 | 'role': 'system', 30 | 'content': 'You are a helpful assistant.' 
31 | }, { 32 | 'role': 'user', 33 | 'content': question, 34 | }], 35 | temperature=0, 36 | max_tokens=max_tokens, 37 | ) 38 | ans['text'] = response['choices'][0]['message']['content'] 39 | return ans 40 | except Exception as e: 41 | print('[ERROR]', e) 42 | ans['text'] = '#ERROR#' 43 | time.sleep(1) 44 | return ans 45 | 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser(description='ChatGPT answer generation.') 49 | parser.add_argument('-q', '--question') 50 | parser.add_argument('-o', '--output') 51 | parser.add_argument('--max-tokens', type=int, default=1024, 52 | help='maximum number of tokens produced in the output') 53 | args = parser.parse_args() 54 | 55 | questions_dict = {} 56 | with open(os.path.expanduser(args.question)) as f: 57 | for line in f: 58 | if not line: 59 | continue 60 | q = json.loads(line) 61 | questions_dict[q['question_id']] = q['text'] 62 | 63 | answers = [] 64 | 65 | with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor: 66 | futures = [] 67 | for qid, question in questions_dict.items(): 68 | future = executor.submit( 69 | get_answer, qid, question, args.max_tokens) 70 | futures.append(future) 71 | 72 | for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)): 73 | answers.append(future.result()) 74 | 75 | answers.sort(key=lambda x: x['question_id']) 76 | 77 | with open(os.path.expanduser(args.output), 'w') as f: 78 | table = [json.dumps(ans) for ans in answers] 79 | f.write('\n'.join(table)) 80 | -------------------------------------------------------------------------------- /EVEv2/eve/eval/run_eve.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from io import BytesIO 3 | 4 | import requests 5 | import torch 6 | from PIL import Image 7 | 8 | from eve.constants import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, 9 | DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) 10 | from eve.conversation import SeparatorStyle, conv_templates 11 | from eve.mm_utils import (KeywordsStoppingCriteria, process_images, tokenizer_image_token) 12 | from eve.model.builder import load_pretrained_model 13 | from eve.utils import disable_torch_init 14 | 15 | 16 | def load_image(image_file): 17 | if image_file.startswith('http') or image_file.startswith('https'): 18 | response = requests.get(image_file) 19 | image = Image.open(BytesIO(response.content)).convert('RGB') 20 | else: 21 | image = Image.open(image_file).convert('RGB') 22 | return image 23 | 24 | 25 | def eval_model(args): 26 | # Model 27 | disable_torch_init() 28 | 29 | tokenizer, model, image_processor, context_len = load_pretrained_model( 30 | args.model_path, args.model_type) 31 | 32 | qs = args.query 33 | qs = DEFAULT_IMAGE_TOKEN + '\n' + qs 34 | 35 | conv = conv_templates[args.conv_mode].copy() 36 | conv.append_message(conv.roles[0], qs) 37 | conv.append_message(conv.roles[1], None) 38 | prompt = conv.get_prompt() 39 | 40 | image = load_image(args.image_file) 41 | image_tensor = process_images([image], image_processor, None)[0] 42 | image_tensor = image_tensor.unsqueeze(0).half().cuda() 43 | 44 | input_ids = tokenizer_image_token( 45 | prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0) 46 | input_ids = input_ids.to(device='cuda', non_blocking=True) 47 | 48 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 49 | keywords = [stop_str] 50 | stopping_criteria = KeywordsStoppingCriteria( 51 | keywords, tokenizer, input_ids) 52 | 53 | with 
torch.inference_mode(): 54 | output_ids = model.generate( 55 | input_ids, 56 | images=image_tensor, 57 | do_sample=args.do_sample, 58 | top_p=args.top_p, 59 | top_k=args.top_k, 60 | num_beams=args.num_beams, 61 | temperature=args.temperature, 62 | use_cache=True, 63 | max_new_tokens=args.max_new_tokens, 64 | stopping_criteria=[stopping_criteria]) 65 | 66 | input_token_len = input_ids.shape[1] 67 | n_diff_input_output = ( 68 | input_ids != output_ids[:, :input_token_len]).sum().item() 69 | if n_diff_input_output > 1: 70 | print( 71 | f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') 72 | outputs = tokenizer.batch_decode( 73 | output_ids[:, input_token_len:], skip_special_tokens=True)[0] 74 | outputs = outputs.strip() 75 | if outputs.endswith(stop_str): 76 | outputs = outputs[:-len(stop_str)] 77 | outputs = outputs.strip() 78 | print(outputs) 79 | 80 | 81 | if __name__ == "__main__": 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument("--model-path", type=str, default=None) 84 | parser.add_argument("--model-type", type=str, default=None) 85 | parser.add_argument("--image-file", type=str, required=True) 86 | parser.add_argument("--query", type=str, required=True) 87 | parser.add_argument("--conv-mode", type=str, default=None) 88 | parser.add_argument("--temperature", type=float, default=None) 89 | parser.add_argument("--top_p", type=float, default=None) 90 | parser.add_argument("--top_k", type=float, default=None) 91 | parser.add_argument("--num_beams", type=int, default=1) 92 | parser.add_argument("--do_sample", type=bool, default=False) 93 | parser.add_argument("--max_new_tokens", type=int, default=512) 94 | args = parser.parse_args() 95 | 96 | eval_model(args) 97 | -------------------------------------------------------------------------------- /EVEv2/eve/eval/summarize_gpt_review.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | from collections import defaultdict 5 | 6 | import numpy as np 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser( 11 | description='ChatGPT-based QA evaluation.') 12 | parser.add_argument('-d', '--dir', default=None) 13 | parser.add_argument('-v', '--version', default=None) 14 | parser.add_argument('-s', '--select', nargs='*', default=None) 15 | parser.add_argument('-f', '--files', nargs='*', default=[]) 16 | parser.add_argument('-i', '--ignore', nargs='*', default=[]) 17 | return parser.parse_args() 18 | 19 | 20 | if __name__ == '__main__': 21 | args = parse_args() 22 | 23 | if args.ignore is not None: 24 | args.ignore = [int(x) for x in args.ignore] 25 | 26 | if len(args.files) > 0: 27 | review_files = args.files 28 | else: 29 | review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith( 30 | 'gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)] 31 | 32 | for review_file in sorted(review_files): 33 | config = os.path.basename(review_file).replace( 34 | 'gpt4_text_', '').replace('.jsonl', '') 35 | if args.select is not None and any(x not in config for x in args.select): 36 | continue 37 | if '0613' in config: 38 | version = '0613' 39 | else: 40 | version = '0314' 41 | if args.version is not None and args.version != version: 42 | continue 43 | scores = defaultdict(list) 44 | print(config) 45 | with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f: 46 | for review_str in f: 47 | review = 
json.loads(review_str) 48 | if review['question_id'] in args.ignore: 49 | continue 50 | if 'category' in review: 51 | scores[review['category']].append(review['tuple']) 52 | scores['all'].append(review['tuple']) 53 | else: 54 | if 'tuple' in review: 55 | scores['all'].append(review['tuple']) 56 | else: 57 | scores['all'].append(review['score']) 58 | for k, v in sorted(scores.items()): 59 | stats = np.asarray(v).mean(0).tolist() 60 | stats = [round(x, 3) for x in stats] 61 | # print(k, stats, round(stats[1]/stats[0]*100, 1)) 62 | print(k, round(stats[1]/stats[0]*100, 1), 63 | round(stats[0] * 10, 1), round(stats[1] * 10, 1)) 64 | print('=================================') 65 | -------------------------------------------------------------------------------- /EVEv2/eve/mm_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from io import BytesIO 3 | 4 | import torch 5 | from PIL import Image 6 | import math 7 | from transformers import StoppingCriteria 8 | 9 | from eve.constants import IMAGE_TOKEN_INDEX 10 | 11 | 12 | def load_image_from_base64(image): 13 | return Image.open(BytesIO(base64.b64decode(image))) 14 | 15 | 16 | def process_images(images, processor, model_cfg=None): 17 | new_images = [] 18 | for raw_image in images: 19 | width, height = raw_image.size 20 | scale_ratio = math.sqrt(width * height) / processor.max_size 21 | min_edge = processor.patch_stride * processor.dense_stride 22 | 23 | width = max(int(math.ceil(width / scale_ratio / min_edge)) * min_edge, min_edge) 24 | height = max(int(math.ceil(height / scale_ratio / min_edge)) * min_edge, min_edge) 25 | 26 | new_image = raw_image.resize((width, height)) 27 | image = processor.preprocess(new_image, return_tensors='pt')['pixel_values'][0] 28 | new_images.append(image) 29 | 30 | if all(x.shape == new_images[0].shape for x in new_images): 31 | new_images = torch.stack(new_images, dim=0) 32 | return new_images 33 | 34 | 35 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 36 | prompt_chunks = [ 37 | tokenizer(chunk).input_ids for chunk in prompt.split('<image>')] 38 | 39 | def insert_separator(X, sep): 40 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 41 | 42 | input_ids = [] 43 | offset = 0 44 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 45 | offset = 1 46 | input_ids.append(prompt_chunks[0][0]) 47 | 48 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 49 | input_ids.extend(x[offset:]) 50 | 51 | if return_tensors is not None: 52 | if return_tensors == 'pt': 53 | return torch.tensor(input_ids, dtype=torch.long) 54 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 55 | return input_ids 56 | 57 | 58 | def get_model_name_from_path(model_path): 59 | model_path = model_path.strip("/") 60 | model_paths = model_path.split("/") 61 | if model_paths[-1].startswith('checkpoint-'): 62 | return model_paths[-2] + "_" + model_paths[-1] 63 | else: 64 | return model_paths[-1].split('_')[0] 65 | 66 | 67 | class KeywordsStoppingCriteria(StoppingCriteria): 68 | def __init__(self, keywords, tokenizer, input_ids): 69 | self.keywords = keywords 70 | self.keyword_ids = [] 71 | self.max_keyword_len = 0 72 | for keyword in keywords: 73 | cur_keyword_ids = tokenizer(keyword).input_ids 74 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 75 | cur_keyword_ids = cur_keyword_ids[1:] 76 | 
if len(cur_keyword_ids) > self.max_keyword_len: 77 | self.max_keyword_len = len(cur_keyword_ids) 78 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 79 | self.tokenizer = tokenizer 80 | self.start_len = input_ids.shape[1] 81 | 82 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 83 | # TODO 84 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" 85 | offset = min(output_ids.shape[1] - 86 | self.start_len, self.max_keyword_len) 87 | self.keyword_ids = [keyword_id.to( 88 | output_ids.device) for keyword_id in self.keyword_ids] 89 | for keyword_id in self.keyword_ids: 90 | if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all(): 91 | return True 92 | outputs = self.tokenizer.batch_decode( 93 | output_ids[:, -offset:], skip_special_tokens=True)[0] 94 | for keyword in self.keywords: 95 | if keyword in outputs: 96 | return True 97 | return False 98 | -------------------------------------------------------------------------------- /EVEv2/eve/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.eve_llama3 import (EVELlamaConfig, EVELlamaForCausalLM) 2 | from .language_model.eve_qwen2 import (EVEQwen2Config, EVEQwen2ForCausalLM) 3 | -------------------------------------------------------------------------------- /EVEv2/eve/model/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | 4 | import torch 5 | from transformers import AutoTokenizer 6 | 7 | from eve.model import * 8 | 9 | 10 | def load_pretrained_model(model_path, model_type, device_map="auto", device="cuda"): 11 | if model_type not in ['qwen2', 'llama3']: 12 | raise ValueError(f"Invalid Model Type {model_type}") 13 | 14 | kwargs = {"device_map": device_map} 15 | kwargs['torch_dtype'] = torch.float16 16 | 17 | if device != "cuda": 18 | kwargs['device_map'] = {"": device} 19 | 20 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) 21 | 22 | if model_type == 'llama3': 23 | model = EVELlamaForCausalLM.from_pretrained( 24 | model_path, low_cpu_mem_usage=True, **kwargs 25 | ) 26 | elif model_type == 'qwen2': 27 | model = EVEQwen2ForCausalLM.from_pretrained( 28 | model_path, low_cpu_mem_usage=True, **kwargs 29 | ) 30 | else: 31 | raise ValueError(f"Invalid model_type in args: {model_type}.") 32 | 33 | model.resize_token_embeddings(len(tokenizer)) 34 | vision_tower = model.get_vision_tower() 35 | vision_tower.to(device=device, dtype=torch.float16) 36 | image_processor = vision_tower.image_processor 37 | 38 | if hasattr(model.config, "max_sequence_length"): 39 | context_len = model.config.max_sequence_length 40 | else: 41 | context_len = 4096 42 | 43 | if model_type == 'llama3' and tokenizer.pad_token_id is None: 44 | tokenizer.pad_token_id = 128002 45 | tokenizer.pad_token = "<|reserved_special_token_0|>" 46 | 47 | if model.generation_config.pad_token_id is None: 48 | model.generation_config.pad_token_id = tokenizer.pad_token_id 49 | 50 | return tokenizer, model, image_processor, context_len 51 | -------------------------------------------------------------------------------- /EVEv2/eve/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m eve.model.consolidate --src ~/model_weights/eve-7b --dst ~/model_weights/eve-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import 
AutoModelForCausalLM, AutoTokenizer 9 | 10 | from eve.model import * 11 | from eve.model.utils import auto_upgrade 12 | 13 | 14 | def consolidate_ckpt(src_path, dst_path): 15 | print("Loading model") 16 | auto_upgrade(src_path) 17 | src_model = AutoModelForCausalLM.from_pretrained( 18 | src_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) 19 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 20 | src_model.save_pretrained(dst_path) 21 | src_tokenizer.save_pretrained(dst_path) 22 | 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--src", type=str, required=True) 27 | parser.add_argument("--dst", type=str, required=True) 28 | 29 | args = parser.parse_args() 30 | 31 | consolidate_ckpt(args.src, args.dst) 32 | -------------------------------------------------------------------------------- /EVEv2/eve/model/language_model/eve_llama3.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from ..eve_arch import EVEMetaModel, EVEMetaForCausalLM 7 | from .llama3 import LlamaModel, LlamaConfig, LlamaForCausalLM 8 | 9 | from transformers.modeling_outputs import CausalLMOutputWithPast 10 | 11 | 12 | class EVELlamaConfig(LlamaConfig): 13 | model_type = "eve-llama3" 14 | 15 | 16 | class EVELlamaModel(EVEMetaModel, LlamaModel): 17 | config_class = EVELlamaConfig 18 | 19 | def __init__(self, config: LlamaConfig): 20 | super(EVELlamaModel, self).__init__(config) 21 | 22 | 23 | class EVELlamaForCausalLM(LlamaForCausalLM, EVEMetaForCausalLM): 24 | config_class = EVELlamaConfig 25 | 26 | def __init__(self, config): 27 | super(LlamaForCausalLM, self).__init__(config) 28 | self.model = EVELlamaModel(config) 29 | self.vocab_size = config.vocab_size 30 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 31 | 32 | # Initialize weights and apply final processing 33 | self.post_init() 34 | 35 | def get_model(self): 36 | return self.model 37 | 38 | def forward( 39 | self, 40 | input_ids: torch.LongTensor = None, 41 | attention_mask: Optional[torch.Tensor] = None, 42 | position_ids: Optional[torch.LongTensor] = None, 43 | past_key_values: Optional[List[torch.FloatTensor]] = None, 44 | inputs_embeds: Optional[torch.FloatTensor] = None, 45 | labels: Optional[torch.LongTensor] = None, 46 | use_cache: Optional[bool] = None, 47 | output_attentions: Optional[bool] = None, 48 | output_hidden_states: Optional[bool] = None, 49 | images: Optional[torch.FloatTensor] = None, 50 | return_dict: Optional[bool] = None, 51 | cache_position: Optional[torch.LongTensor] = None, 52 | ) -> Union[Tuple, CausalLMOutputWithPast]: 53 | 54 | if inputs_embeds is None: 55 | ( 56 | input_ids, 57 | position_ids, 58 | attention_mask, 59 | past_key_values, 60 | inputs_embeds, 61 | labels 62 | ) = self.prepare_inputs_labels_for_multimodal( 63 | input_ids, 64 | position_ids, 65 | attention_mask, 66 | past_key_values, 67 | labels, 68 | images 69 | ) 70 | 71 | return super().forward( 72 | input_ids=input_ids, 73 | attention_mask=attention_mask, 74 | position_ids=position_ids, 75 | past_key_values=past_key_values, 76 | inputs_embeds=inputs_embeds, 77 | labels=labels, 78 | use_cache=use_cache, 79 | output_attentions=output_attentions, 80 | output_hidden_states=output_hidden_states, 81 | return_dict=return_dict, 82 | cache_position=None 83 | ) 84 | 85 | def prepare_inputs_for_generation(self, input_ids, 
past_key_values=None, inputs_embeds=None, attention_mask=None, 86 | **kwargs): 87 | images = kwargs.pop("images", None) 88 | 89 | _inputs = super().prepare_inputs_for_generation( 90 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 91 | **kwargs 92 | ) 93 | 94 | if images is not None: 95 | _inputs['images'] = images 96 | 97 | return _inputs -------------------------------------------------------------------------------- /EVEv2/eve/model/language_model/eve_qwen2.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from ..eve_arch import EVEMetaModel, EVEMetaForCausalLM 7 | from .qwen2 import Qwen2Model, Qwen2Config, Qwen2ForCausalLM 8 | 9 | from transformers.modeling_outputs import CausalLMOutputWithPast 10 | 11 | 12 | class EVEQwen2Config(Qwen2Config): 13 | model_type = "eve-qwen2" 14 | 15 | 16 | class EVEQwen2Model(EVEMetaModel, Qwen2Model): 17 | config_class = EVEQwen2Config 18 | 19 | def __init__(self, config: Qwen2Config): 20 | super(EVEQwen2Model, self).__init__(config) 21 | 22 | 23 | class EVEQwen2ForCausalLM(Qwen2ForCausalLM, EVEMetaForCausalLM): 24 | config_class = EVEQwen2Config 25 | 26 | def __init__(self, config): 27 | super(Qwen2ForCausalLM, self).__init__(config) 28 | self.model = EVEQwen2Model(config) 29 | self.vocab_size = config.vocab_size 30 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 31 | 32 | # Initialize weights and apply final processing 33 | self.post_init() 34 | 35 | def get_model(self): 36 | return self.model 37 | 38 | def forward( 39 | self, 40 | input_ids: torch.LongTensor = None, 41 | attention_mask: Optional[torch.Tensor] = None, 42 | position_ids: Optional[torch.LongTensor] = None, 43 | past_key_values: Optional[List[torch.FloatTensor]] = None, 44 | inputs_embeds: Optional[torch.FloatTensor] = None, 45 | labels: Optional[torch.LongTensor] = None, 46 | use_cache: Optional[bool] = None, 47 | output_attentions: Optional[bool] = None, 48 | output_hidden_states: Optional[bool] = None, 49 | images: Optional[torch.FloatTensor] = None, 50 | return_dict: Optional[bool] = None, 51 | ) -> Union[Tuple, CausalLMOutputWithPast]: 52 | 53 | if inputs_embeds is None: 54 | ( 55 | input_ids, 56 | position_ids, 57 | attention_mask, 58 | past_key_values, 59 | inputs_embeds, 60 | labels, 61 | visual_token_mask, 62 | ) = self.prepare_inputs_labels_for_multimodal( 63 | input_ids, 64 | position_ids, 65 | attention_mask, 66 | past_key_values, 67 | labels, 68 | images 69 | ) 70 | 71 | return super().forward( 72 | input_ids=input_ids, 73 | attention_mask=attention_mask, 74 | position_ids=position_ids, 75 | past_key_values=past_key_values, 76 | inputs_embeds=inputs_embeds, 77 | labels=labels, 78 | use_cache=use_cache, 79 | output_attentions=output_attentions, 80 | output_hidden_states=output_hidden_states, 81 | return_dict=return_dict, 82 | visual_token_mask=visual_token_mask 83 | ) 84 | 85 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, attention_mask=None, 86 | **kwargs): 87 | images = kwargs.pop("images", None) 88 | _inputs = super().prepare_inputs_for_generation( 89 | input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, 90 | **kwargs 91 | ) 92 | if images is not None: 93 | _inputs['images'] = images 94 | return _inputs 95 | 
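Unlike the llama3 wrapper above, the qwen2 wrapper threads a visual_token_mask returned by prepare_inputs_labels_for_multimodal (defined in eve_arch.py, not shown here) into the backbone's forward call, presumably so that modality-aware layers can treat visual and textual positions differently. The sketch below only illustrates the assumed semantics of that mask; build_visual_token_mask and the sentinel value -200 are hypothetical placeholders, not repository code.

import torch

def build_visual_token_mask(input_ids: torch.LongTensor, image_token_index: int = -200) -> torch.Tensor:
    # Hypothetical helper: True wherever the packed sequence holds an image placeholder,
    # assuming image positions are marked with a sentinel id such as IMAGE_TOKEN_INDEX.
    return input_ids == image_token_index

ids = torch.tensor([[101, 5, -200, -200, 7]])
print(build_visual_token_mask(ids))  # tensor([[False, False,  True,  True, False]])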
-------------------------------------------------------------------------------- /EVEv2/eve/model/language_model/llama3/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_flax_available, 20 | is_sentencepiece_available, 21 | is_tokenizers_available, 22 | is_torch_available, 23 | ) 24 | 25 | 26 | _import_structure = { 27 | "configuration_llama": ["LlamaConfig"], 28 | } 29 | 30 | try: 31 | if not is_sentencepiece_available(): 32 | raise OptionalDependencyNotAvailable() 33 | except OptionalDependencyNotAvailable: 34 | pass 35 | else: 36 | _import_structure["tokenization_llama"] = ["LlamaTokenizer"] 37 | 38 | try: 39 | if not is_tokenizers_available(): 40 | raise OptionalDependencyNotAvailable() 41 | except OptionalDependencyNotAvailable: 42 | pass 43 | else: 44 | _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"] 45 | 46 | try: 47 | if not is_torch_available(): 48 | raise OptionalDependencyNotAvailable() 49 | except OptionalDependencyNotAvailable: 50 | pass 51 | else: 52 | _import_structure["modeling_llama"] = [ 53 | "LlamaForCausalLM", 54 | "LlamaModel", 55 | "LlamaPreTrainedModel", 56 | "LlamaForSequenceClassification", 57 | "LlamaForQuestionAnswering", 58 | "LlamaForTokenClassification", 59 | ] 60 | 61 | try: 62 | if not is_flax_available(): 63 | raise OptionalDependencyNotAvailable() 64 | except OptionalDependencyNotAvailable: 65 | pass 66 | else: 67 | _import_structure["modeling_flax_llama"] = ["FlaxLlamaForCausalLM", "FlaxLlamaModel", "FlaxLlamaPreTrainedModel"] 68 | 69 | 70 | if TYPE_CHECKING: 71 | from .configuration_llama import LlamaConfig 72 | 73 | try: 74 | if not is_sentencepiece_available(): 75 | raise OptionalDependencyNotAvailable() 76 | except OptionalDependencyNotAvailable: 77 | pass 78 | else: 79 | from .tokenization_llama import LlamaTokenizer 80 | 81 | try: 82 | if not is_tokenizers_available(): 83 | raise OptionalDependencyNotAvailable() 84 | except OptionalDependencyNotAvailable: 85 | pass 86 | else: 87 | from .tokenization_llama_fast import LlamaTokenizerFast 88 | 89 | try: 90 | if not is_torch_available(): 91 | raise OptionalDependencyNotAvailable() 92 | except OptionalDependencyNotAvailable: 93 | pass 94 | else: 95 | from .modeling_llama import ( 96 | LlamaForCausalLM, 97 | LlamaForQuestionAnswering, 98 | LlamaForSequenceClassification, 99 | LlamaForTokenClassification, 100 | LlamaModel, 101 | LlamaPreTrainedModel, 102 | ) 103 | 104 | try: 105 | if not is_flax_available(): 106 | raise OptionalDependencyNotAvailable() 107 | except OptionalDependencyNotAvailable: 108 | pass 109 | else: 110 | from .modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel, FlaxLlamaPreTrainedModel 111 | 112 | 113 
| else: 114 | import sys 115 | 116 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) -------------------------------------------------------------------------------- /EVEv2/eve/model/language_model/qwen2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The Qwen Team and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_tokenizers_available, 20 | is_torch_available, 21 | ) 22 | 23 | 24 | _import_structure = { 25 | "configuration_qwen2": ["QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Qwen2Config"], 26 | "tokenization_qwen2": ["Qwen2Tokenizer"], 27 | } 28 | 29 | try: 30 | if not is_tokenizers_available(): 31 | raise OptionalDependencyNotAvailable() 32 | except OptionalDependencyNotAvailable: 33 | pass 34 | else: 35 | _import_structure["tokenization_qwen2_fast"] = ["Qwen2TokenizerFast"] 36 | 37 | try: 38 | if not is_torch_available(): 39 | raise OptionalDependencyNotAvailable() 40 | except OptionalDependencyNotAvailable: 41 | pass 42 | else: 43 | _import_structure["modeling_qwen2"] = [ 44 | "Qwen2ForCausalLM", 45 | "Qwen2Model", 46 | "Qwen2PreTrainedModel", 47 | "Qwen2ForSequenceClassification", 48 | ] 49 | 50 | 51 | if TYPE_CHECKING: 52 | from .configuration_qwen2 import QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP, Qwen2Config 53 | from .tokenization_qwen2 import Qwen2Tokenizer 54 | 55 | try: 56 | if not is_tokenizers_available(): 57 | raise OptionalDependencyNotAvailable() 58 | except OptionalDependencyNotAvailable: 59 | pass 60 | else: 61 | from .tokenization_qwen2_fast import Qwen2TokenizerFast 62 | 63 | try: 64 | if not is_torch_available(): 65 | raise OptionalDependencyNotAvailable() 66 | except OptionalDependencyNotAvailable: 67 | pass 68 | else: 69 | from .modeling_qwen2 import ( 70 | Qwen2ForCausalLM, 71 | Qwen2ForSequenceClassification, 72 | Qwen2Model, 73 | Qwen2PreTrainedModel, 74 | ) 75 | 76 | 77 | else: 78 | import sys 79 | 80 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) -------------------------------------------------------------------------------- /EVEv2/eve/model/multimodal_encoder/vision_tokenizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import copy 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from transformers import CLIPImageProcessor 7 | 8 | 9 | class VisionTokenizer(nn.Module): 10 | def __init__(self, input_size, output_size, vision_tower_name): 11 | super().__init__() 12 | 13 | self.is_loaded = True 14 | self.hidden_size = input_size 15 | self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name) 16 | 17 | self.patch_stride = self.image_processor.patch_stride 18 | 
self.dense_stride = self.image_processor.dense_stride 19 | 20 | self.patch_embedding = nn.Sequential(nn.Conv2d(3, input_size, 21 | kernel_size=self.patch_stride, 22 | stride=self.patch_stride), 23 | nn.GELU(), 24 | nn.Conv2d(input_size, output_size, 25 | kernel_size=self.dense_stride, 26 | stride=self.dense_stride)) 27 | self.class_embedding = nn.Parameter(torch.randn(output_size)) 28 | self.split_embedding = nn.Parameter(torch.randn(output_size)) 29 | 30 | def forward(self, pixel_values): 31 | 32 | patch_embeds = [] 33 | for i in range(len(pixel_values)): 34 | pixel_value = pixel_values[i].to(dtype=self.dtype) 35 | patch_embed = self.patch_embedding(pixel_value.unsqueeze(0))[0] 36 | split_embed = self.split_embedding[:, None, None].repeat(1, patch_embed.shape[1], 1) 37 | patch_embed = torch.cat([patch_embed, split_embed.to(dtype=self.dtype)], dim=-1) 38 | 39 | class_embed = self.class_embedding[None, :].to(dtype=self.dtype) 40 | patch_embeds.append(torch.cat([class_embed, patch_embed.flatten(1).transpose(0, 1)], dim=0)) 41 | 42 | return patch_embeds 43 | 44 | @property 45 | def dtype(self): 46 | return self.patch_embedding[0].weight.dtype 47 | 48 | @property 49 | def device(self): 50 | return self.patch_embedding[0].weight.device -------------------------------------------------------------------------------- /EVEv2/eve/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'eve' in config and 'eve' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer EVE code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "eve") 15 | cfg.architectures[0] = 'EVELlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /EVEv2/eve/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | from eve.train.train import train 7 | 8 | # from eve.train.replace_with_flash_attn import flash_attn_replace 9 | # flash_attn_replace() 10 | 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /EVEv2/eve/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def disable_torch_init(): 5 | """ 6 | Disable the redundant torch default initialization to accelerate model creation. 
7 | """ 8 | setattr(torch.nn.Linear, "reset_parameters", lambda self: None) 9 | setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) 10 | -------------------------------------------------------------------------------- /EVEv2/examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/examples/.DS_Store -------------------------------------------------------------------------------- /EVEv2/examples/MAR.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/examples/MAR.jpg -------------------------------------------------------------------------------- /EVEv2/examples/mac.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/examples/mac.jpg -------------------------------------------------------------------------------- /EVEv2/examples/ocr_beijing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/examples/ocr_beijing.jpg -------------------------------------------------------------------------------- /EVEv2/images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/images/.DS_Store -------------------------------------------------------------------------------- /EVEv2/images/eve_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/images/eve_logo.png -------------------------------------------------------------------------------- /EVEv2/images/eve_motivation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/images/eve_motivation.jpg -------------------------------------------------------------------------------- /EVEv2/images/eve_results.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/images/eve_results.jpg -------------------------------------------------------------------------------- /EVEv2/images/eve_structure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/images/eve_structure.jpg -------------------------------------------------------------------------------- /EVEv2/openai/eve-anyratio-res1600-patch16/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_center_crop": false, 3 | "do_normalize": true, 4 | "do_resize": false, 5 | "feature_extractor_type": "CLIPFeatureExtractor", 6 | "image_mean": [ 7 | 0.48145466, 8 | 0.4578275, 9 | 0.40821073 10 | ], 11 | "image_std": [ 12 | 0.26862954, 13 | 0.26130258, 14 | 0.27577711 15 | ], 16 | "resample": 3, 17 | "max_size": 1600, 18 | "patch_stride": 16, 19 | "dense_stride": 2 20 | } 
-------------------------------------------------------------------------------- /EVEv2/openai/eve-anyratio-res800-patch16/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_center_crop": false, 3 | "do_normalize": true, 4 | "do_resize": false, 5 | "feature_extractor_type": "CLIPFeatureExtractor", 6 | "image_mean": [ 7 | 0.48145466, 8 | 0.4578275, 9 | 0.40821073 10 | ], 11 | "image_std": [ 12 | 0.26862954, 13 | 0.26130258, 14 | 0.27577711 15 | ], 16 | "resample": 3, 17 | "max_size": 800, 18 | "patch_stride": 16, 19 | "dense_stride": 2 20 | } -------------------------------------------------------------------------------- /EVEv2/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "eve" 7 | version = "2.0.0" 8 | description = "EVEv2: Improved Baselines for Encoder-Free Vision-Language Models." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | 'accelerate', 'apex', 'bitsandbytes', 'datasets', 'deepspeed', 'einops', 'einops-exts', "ninja", 17 | 'fastapi', 'flash_attn', 'gradio', 'gradio_client', 'httpx', 'markdown2', 'numpy', 'openpyxl', "wandb", 18 | 'peft', 'protobuf', 'pydantic', 'pypandoc', 'requests', 'scikit-learn', 'sentencepiece', 'shortuuid', 19 | 'tabulate', 'timm', 'tiktoken', 'tokenizers', 'torch', 'torchvision', 'transformers', 'uvicorn', 'xformers' 20 | ] 21 | 22 | [project.urls] 23 | "Homepage" = "https://eve.github.io/" 24 | "Bug Tracker" = "https://github.com/baaivision/EVE/issues" 25 | 26 | [tool.setuptools.packages.find] 27 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 28 | 29 | [tool.wheel] 30 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 31 | -------------------------------------------------------------------------------- /EVEv2/scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/scripts/.DS_Store -------------------------------------------------------------------------------- /EVEv2/scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--src", type=str) 6 | parser.add_argument("--dst", type=str) 7 | args = parser.parse_args() 8 | 9 | all_answers = [] 10 | for line_idx, line in enumerate(open(args.src)): 11 | res = json.loads(line) 12 | question_id = res['question_id'] 13 | text = res['text'].rstrip('.').lower() 14 | all_answers.append({"questionId": question_id, "prediction": text}) 15 | 16 | with open(args.dst, 'w') as f: 17 | json.dump(all_answers, f) 18 | -------------------------------------------------------------------------------- /EVEv2/scripts/convert_mmbench_for_submission.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import pandas as pd 6 | 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--annotation-file", type=str, required=True) 11 
| parser.add_argument("--result-dir", type=str, required=True) 12 | parser.add_argument("--upload-dir", type=str, required=True) 13 | parser.add_argument("--experiment", type=str, required=True) 14 | 15 | return parser.parse_args() 16 | 17 | 18 | if __name__ == "__main__": 19 | args = get_args() 20 | 21 | df = pd.read_table(args.annotation_file) 22 | 23 | cur_df = df.copy() 24 | cur_df = cur_df.drop( 25 | columns=['hint', 'category', 'source', 'image', 'comment', 'l2-category']) 26 | cur_df.insert(6, 'prediction', None) 27 | for pred in open(os.path.join(args.result_dir, f"{args.experiment}.jsonl")): 28 | pred = json.loads(pred) 29 | cur_df.loc[df['index'] == pred['question_id'], 30 | 'prediction'] = pred['text'] 31 | 32 | cur_df.to_excel(os.path.join(args.upload_dir, 33 | f"{args.experiment}.xlsx"), index=False, engine='openpyxl') 34 | -------------------------------------------------------------------------------- /EVEv2/scripts/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data['question_id'] 15 | cur_result[f'v1_{qid}'] = data['text'] 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /EVEv2/scripts/convert_seed_for_submission.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | 5 | def get_args(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--annotation-file", type=str) 8 | parser.add_argument("--result-file", type=str) 9 | parser.add_argument("--result-upload-file", type=str) 10 | parser.add_argument('-t', "--tolerance", action='store_true') 11 | return parser.parse_args() 12 | 13 | 14 | def eval_single(result_file, eval_only_type=None): 15 | results = {} 16 | for line in open(result_file): 17 | row = json.loads(line) 18 | results[row['question_id']] = row 19 | 20 | type_counts = {} 21 | correct_counts = {} 22 | for question_data in data['questions']: 23 | if eval_only_type is not None and question_data['data_type'] != eval_only_type: 24 | continue 25 | data_type = question_data['question_type_id'] 26 | type_counts[data_type] = type_counts.get(data_type, 0) + 1 27 | try: 28 | question_id = int(question_data['question_id']) 29 | except: 30 | question_id = question_data['question_id'] 31 | if question_id not in results: 32 | correct_counts[data_type] = correct_counts.get(data_type, 0) 33 | continue 34 | row = results[question_id] 35 | if args.tolerance: 36 | if row['text'] == question_data['answer'] or row['text'][0] == question_data['answer']: 37 | correct_counts[data_type] = correct_counts.get( 38 | data_type, 0) + 1 39 | else: 40 | if row['text'] == question_data['answer']: 41 | correct_counts[data_type] = correct_counts.get( 42 | data_type, 0) + 1 43 | 44 | total_count = 0 45 | total_correct = 0 46 | for data_type in sorted(type_counts.keys()): 47 | correct_count = correct_counts.get(data_type, 0) 48 | total_questions = type_counts[data_type] 49 | 50 | # Guard against division by zero 51 | accuracy = (correct_count / total_questions * 52 | 100) if total_questions > 0 else 0.0 53 | if eval_only_type is None: 54 | 
print(f"{ques_type_id_to_name[data_type]}: {accuracy:.2f}%") 55 | 56 | total_count += type_counts[data_type] 57 | total_correct += correct_count 58 | 59 | total_accuracy = total_correct / total_count * 100 60 | if eval_only_type is None: 61 | print(f"Total accuracy: {total_accuracy:.2f}%") 62 | else: 63 | print(f"{eval_only_type} accuracy: {total_accuracy:.2f}%") 64 | 65 | return results 66 | 67 | 68 | if __name__ == "__main__": 69 | args = get_args() 70 | data = json.load(open(args.annotation_file)) 71 | ques_type_id_to_name = {id: n for n, id in data['question_type'].items()} 72 | 73 | results = eval_single(args.result_file) 74 | eval_single(args.result_file, eval_only_type='image') 75 | -------------------------------------------------------------------------------- /EVEv2/scripts/convert_sqa_to_eve.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import fire 5 | 6 | from scripts.convert_sqa_to_eve_base_prompt import build_prompt_chatbot 7 | 8 | 9 | def convert_to_eve(base_dir, split, prompt_format="QCM-LEA"): 10 | split_indices = json.load( 11 | open(os.path.join(base_dir, "pid_splits.json")))[split] 12 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 13 | 14 | split_problems = build_prompt_chatbot( 15 | problems, split_indices, prompt_format, 16 | use_caption=False, is_test=False) 17 | 18 | target_format = [] 19 | for prob_id, (input, output) in split_problems.items(): 20 | if input.startswith('Question: '): 21 | input = input.replace('Question: ', '') 22 | if output.startswith('Answer: '): 23 | output = output.replace('Answer: ', '') 24 | 25 | raw_prob_data = problems[prob_id] 26 | if raw_prob_data['image'] is None: 27 | target_format.append({ 28 | "id": prob_id, 29 | "conversations": [ 30 | {'from': 'human', 'value': f"{input}"}, 31 | {'from': 'gpt', 'value': f"{output}"}, 32 | ], 33 | }) 34 | 35 | else: 36 | target_format.append({ 37 | "id": prob_id, 38 | "image": os.path.join(prob_id, raw_prob_data['image']), 39 | "conversations": [ 40 | {'from': 'human', 'value': f"{input}\n"}, 41 | {'from': 'gpt', 'value': f"{output}"}, 42 | ], 43 | }) 44 | 45 | print(f'Number of samples: {len(target_format)}') 46 | 47 | with open(os.path.join(base_dir, f"llava_{split}_{prompt_format}.json"), "w") as f: 48 | json.dump(target_format, f, indent=2) 49 | 50 | 51 | def convert_to_jsonl(base_dir, split, prompt_format="QCM-LEPA"): 52 | split_indices = json.load( 53 | open(os.path.join(base_dir, "pid_splits.json")))[split] 54 | problems = json.load(open(os.path.join(base_dir, "problems.json"))) 55 | 56 | split_problems = build_prompt_chatbot( 57 | problems, split_indices, prompt_format, 58 | use_caption=False, is_test=False) 59 | 60 | writer = open(os.path.join( 61 | base_dir, f"scienceqa_{split}_{prompt_format}.jsonl"), "w") 62 | for prob_id, (input, output) in split_problems.items(): 63 | if input.startswith('Question: '): 64 | input = input.replace('Question: ', '') 65 | if output.startswith('Answer: '): 66 | output = output.replace('Answer: ', '') 67 | 68 | raw_prob_data = problems[prob_id] 69 | if raw_prob_data['image'] is None: 70 | data = { 71 | "id": prob_id, 72 | "instruction": f"{input}", 73 | "output": f"{output}", 74 | } 75 | 76 | else: 77 | data = { 78 | "id": prob_id, 79 | "image": os.path.join(prob_id, raw_prob_data['image']), 80 | "instruction": f"{input}\n", 81 | "output": f"{output}", 82 | } 83 | writer.write(json.dumps(data) + '\n') 84 | writer.close() 85 | 86 | 87 | def main(task, 
**kwargs): 88 | globals()[task](**kwargs) 89 | 90 | 91 | if __name__ == "__main__": 92 | fire.Fire(main) 93 | -------------------------------------------------------------------------------- /EVEv2/scripts/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import sys 6 | 7 | from eve.eval.m4c_evaluator import EvalAIAnswerProcessor 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--annotation-file', type=str, required=True) 13 | parser.add_argument('--result-file', type=str, required=True) 14 | parser.add_argument('--result-upload-file', type=str, required=True) 15 | return parser.parse_args() 16 | 17 | 18 | if __name__ == '__main__': 19 | 20 | args = parse_args() 21 | 22 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 23 | 24 | results = [] 25 | error_line = 0 26 | for line_idx, line in enumerate(open(args.result_file)): 27 | try: 28 | results.append(json.loads(line)) 29 | except: 30 | error_line += 1 31 | results = {x['question_id']: x['text'] for x in results} 32 | test_split = [json.loads(line) for line in open(args.annotation_file)] 33 | split_ids = set([x['question_id'] for x in test_split]) 34 | 35 | print( 36 | f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 37 | 38 | all_answers = [] 39 | 40 | answer_processor = EvalAIAnswerProcessor() 41 | 42 | for x in test_split: 43 | assert x['question_id'] in results 44 | all_answers.append({ 45 | 'image': x['image'], 46 | 'answer': answer_processor(results[x['question_id']]) 47 | }) 48 | 49 | with open(args.result_upload_file, 'w') as f: 50 | json.dump(all_answers, f) 51 | -------------------------------------------------------------------------------- /EVEv2/scripts/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | import sys 6 | 7 | from eve.eval.m4c_evaluator import EvalAIAnswerProcessor 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--dir', type=str, default="./playground/data/eval/vqav2") 13 | parser.add_argument('--ckpt', type=str, required=True) 14 | parser.add_argument('--split', type=str, required=True) 15 | return parser.parse_args() 16 | 17 | 18 | if __name__ == '__main__': 19 | 20 | args = parse_args() 21 | 22 | src = os.path.join(args.dir, 'answers', args.split, args.ckpt, 'merge.jsonl') 23 | test_split = os.path.join(args.dir, 'llava_vqav2_mscoco_test2015.jsonl') 24 | dst = os.path.join(args.dir, 'answers_upload', args.split, f'{args.ckpt}.json') 25 | os.makedirs(os.path.dirname(dst), exist_ok=True) 26 | 27 | results = [] 28 | error_line = 0 29 | for line_idx, line in enumerate(open(src)): 30 | try: 31 | results.append(json.loads(line)) 32 | except: 33 | error_line += 1 34 | 35 | results = {x['question_id']: x['text'] for x in results} 36 | test_split = [json.loads(line) for line in open(test_split)] 37 | split_ids = set([x['question_id'] for x in test_split]) 38 | 39 | print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') 40 | 41 | all_answers = [] 42 | 43 | answer_processor = EvalAIAnswerProcessor() 44 | 45 | for x in test_split: 46 | if x['question_id'] not in results: 47 | all_answers.append({ 48 | 'question_id': x['question_id'], 49 | 'answer': '' 50 | }) 51 | else: 52 | 
all_answers.append({ 53 | 'question_id': x['question_id'], 54 | 'answer': answer_processor(results[x['question_id']]) 55 | }) 56 | 57 | with open(dst, 'w') as f: 58 | json.dump(all_answers, f) 59 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baaivision/EVE/c42d6f4946b397d19a07e04a6536813684764ce8/EVEv2/scripts/eve/.DS_Store -------------------------------------------------------------------------------- /EVEv2/scripts/eve/0_eve7b_prealign_anyratio_ve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_SOCKET_IFNAME=eth0 3 | export NCCL_IB_DISABLE=0 4 | export NCCL_IB_CUDA_SUPPORT=1 5 | export NCCL_IB_GID_INDEX=0 6 | export NCCL_DEBUG=INFO 7 | export NCCL_IB_TIMEOUT=23 8 | export NCCL_IB_RETRY_CNT=7 9 | export NCCL_IB_HCA=mlx5_2,mlx5_5 10 | export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" 11 | 12 | set -x 13 | 14 | wandb offline 15 | apt-get install -y libibverbs1 16 | apt-get install -y libaio-dev 17 | 18 | mkdir -p logs 19 | 20 | export CUDA_DEVICE_MAX_CONNECTIONS=1 21 | export GPUS_PER_NODE=8 22 | export NNODES=16 23 | export MASTER_PORT=12345 24 | export CPUS_PER_TASK=32 25 | export QUOTA=reserved 26 | 27 | export DATA_PATH=playground/data/EVE-v2.0-Pretrain/json_path/datacomp1b-10m-index.json 28 | export JSON_PATH=playground/data/EVE-v2.0-Pretrain/json_path 29 | export IMAGE_PATH=playground/data/EVE-v2.0-Pretrain/image_path 30 | export VIT_PATH=openai/eve-anyratio-res800-patch16 31 | 32 | export BASE_LR=2e-4 33 | export LEARNIG_RATE=2e-4 34 | 35 | export CKPT_PATH=lmsys/Qwen2.5-7B-Instruct 36 | export SAVE_PATH=EVEv2-stage0 37 | 38 | 39 | torchrun --nproc_per_node=$GPUS_PER_NODE --nnode=$NNODES --node_rank=$1 --master_addr=$2 --master_port=$MASTER_PORT \ 40 | eve/train/train_mem.py \ 41 | --model_name_or_path ${CKPT_PATH} \ 42 | --deepspeed ./scripts/zero3.json \ 43 | --version plain \ 44 | --model_type qwen2 \ 45 | --data_path ${DATA_PATH} \ 46 | --json_path ${JSON_PATH} \ 47 | --image_folder ${IMAGE_PATH} \ 48 | --vision_tower ${VIT_PATH} \ 49 | --tune_VE True \ 50 | --bf16 True \ 51 | --output_dir checkpoints/${SAVE_PATH} \ 52 | --num_train_epochs 1 \ 53 | --per_device_train_batch_size 8 \ 54 | --per_device_eval_batch_size 4 \ 55 | --gradient_accumulation_steps 1 \ 56 | --eval_strategy "no" \ 57 | --save_strategy "steps" \ 58 | --save_steps 4000 \ 59 | --save_total_limit 10 \ 60 | --learning_rate ${BASE_LR} \ 61 | --vision_tower_lr ${LEARNIG_RATE} \ 62 | --weight_decay 0.
\ 63 | --warmup_ratio 0.03 \ 64 | --lr_scheduler_type "cosine" \ 65 | --logging_steps 1 \ 66 | --tf32 True \ 67 | --model_max_length 2048 \ 68 | --gradient_checkpointing True \ 69 | --dataloader_num_workers 4 \ 70 | --lazy_preprocess True \ 71 | --report_to tensorboard \ 72 | --run_name ${SAVE_PATH} \ 73 | 2>&1 | tee logs/${SAVE_PATH}-rank$1.log 74 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/0_notrain_copy_llm_weight_into_moe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_SOCKET_IFNAME=eth0 3 | export NCCL_IB_DISABLE=0 4 | export NCCL_IB_CUDA_SUPPORT=1 5 | export NCCL_IB_GID_INDEX=0 6 | export NCCL_DEBUG=INFO 7 | export NCCL_IB_TIMEOUT=23 8 | export NCCL_IB_RETRY_CNT=7 9 | export NCCL_IB_HCA=mlx5_2,mlx5_5 10 | export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" 11 | 12 | set -x 13 | 14 | wandb offline 15 | apt-get install -y libibverbs1 16 | apt-get install -y libaio-dev 17 | 18 | mkdir -p logs 19 | 20 | export CUDA_DEVICE_MAX_CONNECTIONS=1 21 | export GPUS_PER_NODE=1 22 | export NNODES=1 23 | export MASTER_PORT=12345 24 | export CPUS_PER_TASK=32 25 | export QUOTA=reserved 26 | 27 | export BASE_LR=2e-4 28 | export VE_LR=2e-4 29 | 30 | export CKPT_PATH=checkpoints/EVEv2-stage0 31 | export SAVE_PATH=EVEv2-stage0-rep 32 | 33 | 34 | torchrun --nproc_per_node=$GPUS_PER_NODE --nnode=$NNODES --node_rank=0 --master_addr=$1 --master_port=$MASTER_PORT \ 35 | eve/train/repeat_moe.py \ 36 | --model_name_or_path ${CKPT_PATH} \ 37 | --deepspeed ./scripts/zero2.json \ 38 | --model_type qwen2 \ 39 | --add_moe True \ 40 | --moe_part layernorm-self_attn-mlp \ 41 | --bf16 True \ 42 | --output_dir checkpoints/${SAVE_PATH} \ 43 | --num_train_epochs 1 \ 44 | --per_device_train_batch_size 2 \ 45 | --per_device_eval_batch_size 4 \ 46 | --gradient_accumulation_steps 1 \ 47 | --eval_strategy "no" \ 48 | --save_strategy "steps" \ 49 | --save_steps 4000 \ 50 | --save_total_limit 10 \ 51 | --learning_rate ${BASE_LR} \ 52 | --vision_tower_lr ${VE_LR} \ 53 | --weight_decay 0. 
\ 54 | --warmup_ratio 0.03 \ 55 | --lr_scheduler_type "cosine" \ 56 | --logging_steps 1 \ 57 | --tf32 True \ 58 | --model_max_length 2048 \ 59 | --gradient_checkpointing True \ 60 | --dataloader_num_workers 4 \ 61 | --report_to tensorboard \ 62 | --run_name ${SAVE_PATH} \ 63 | 2>&1 | tee logs/${SAVE_PATH}-rank$1.log -------------------------------------------------------------------------------- /EVEv2/scripts/eve/1.0_eve7b_prealign_anyratio_ve_moe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_SOCKET_IFNAME=eth0 3 | export NCCL_IB_DISABLE=0 4 | export NCCL_IB_CUDA_SUPPORT=1 5 | export NCCL_IB_GID_INDEX=0 6 | export NCCL_DEBUG=INFO 7 | export NCCL_IB_TIMEOUT=23 8 | export NCCL_IB_RETRY_CNT=7 9 | export NCCL_IB_HCA=mlx5_2,mlx5_5 10 | export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" 11 | 12 | set -x 13 | 14 | wandb offline 15 | apt-get install -y libibverbs1 16 | apt-get install -y libaio-dev 17 | 18 | mkdir -p logs 19 | 20 | export CUDA_DEVICE_MAX_CONNECTIONS=1 21 | export GPUS_PER_NODE=8 22 | export NNODES=16 23 | export MASTER_PORT=12345 24 | export CPUS_PER_TASK=32 25 | export QUOTA=reserved 26 | 27 | export DATA_PATH=playground/data/EVE-v2.0-Pretrain/json_path/datacomp1b-30m-index.json 28 | export JSON_PATH=playground/data/EVE-v2.0-Pretrain/json_path 29 | export IMAGE_PATH=playground/data/EVE-v2.0-Pretrain/image_path 30 | export VIT_PATH=openai/eve-anyratio-res800-patch16 31 | 32 | export BASE_LR=1e-4 33 | export LEARNIG_RATE=1e-4 34 | 35 | export CKPT_PATH=checkpoints/EVEv2-stage0-rep 36 | export SAVE_PATH=EVEv2-stage1.0 37 | 38 | 39 | torchrun --nproc_per_node=$GPUS_PER_NODE --nnode=$NNODES --node_rank=$1 --master_addr=$2 --master_port=$MASTER_PORT \ 40 | eve/train/train_mem.py \ 41 | --model_name_or_path ${CKPT_PATH} \ 42 | --deepspeed ./scripts/zero3.json \ 43 | --version plain \ 44 | --model_type qwen2 \ 45 | --data_path ${DATA_PATH} \ 46 | --json_path ${JSON_PATH} \ 47 | --image_folder ${IMAGE_PATH} \ 48 | --vision_tower ${VIT_PATH} \ 49 | --add_moe True \ 50 | --moe_part layernorm-self_attn-mlp \ 51 | --tune_VE True \ 52 | --tune_MOE True \ 53 | --bf16 True \ 54 | --output_dir checkpoints/${SAVE_PATH} \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size 4 \ 57 | --per_device_eval_batch_size 4 \ 58 | --gradient_accumulation_steps 2 \ 59 | --eval_strategy "no" \ 60 | --save_strategy "steps" \ 61 | --save_steps 4000 \ 62 | --save_total_limit 10 \ 63 | --learning_rate ${BASE_LR} \ 64 | --vision_tower_lr ${LEARNIG_RATE} \ 65 | --weight_decay 0. 
\ 66 | --warmup_ratio 0.03 \ 67 | --lr_scheduler_type "cosine" \ 68 | --logging_steps 1 \ 69 | --tf32 True \ 70 | --model_max_length 2048 \ 71 | --gradient_checkpointing True \ 72 | --dataloader_num_workers 4 \ 73 | --lazy_preprocess True \ 74 | --report_to tensorboard \ 75 | --run_name ${SAVE_PATH} \ 76 | 2>&1 | tee logs/${SAVE_PATH}-rank$1.log -------------------------------------------------------------------------------- /EVEv2/scripts/eve/1.1_eve7b_prealign_anyratio_ve_moe_hd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_SOCKET_IFNAME=eth0 3 | export NCCL_IB_DISABLE=0 4 | export NCCL_IB_CUDA_SUPPORT=1 5 | export NCCL_IB_GID_INDEX=0 6 | export NCCL_DEBUG=INFO 7 | export NCCL_IB_TIMEOUT=23 8 | export NCCL_IB_RETRY_CNT=7 9 | export NCCL_IB_HCA=mlx5_2,mlx5_5 10 | export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" 11 | 12 | set -x 13 | 14 | wandb offline 15 | apt-get install -y libibverbs1 16 | apt-get install -y libaio-dev 17 | 18 | mkdir -p logs 19 | 20 | export CUDA_DEVICE_MAX_CONNECTIONS=1 21 | export GPUS_PER_NODE=8 22 | export NNODES=16 23 | export MASTER_PORT=12345 24 | export CPUS_PER_TASK=32 25 | export QUOTA=reserved 26 | 27 | export DATA_PATH=playground/data/EVE-v2.0-Pretrain/json_path/laion-openimages-sam-datacomp1b-48m-index.json 28 | export JSON_PATH=playground/data/EVE-v2.0-Pretrain/json_path 29 | export IMAGE_PATH=playground/data/EVE-v2.0-Pretrain/image_path 30 | export VIT_PATH=openai/eve-anyratio-res1600-patch16 31 | 32 | export BASE_LR=5e-5 33 | export LEARNIG_RATE=5e-5 34 | 35 | export CKPT_PATH=checkpoints/EVEv2-stage1.0 36 | export SAVE_PATH=EVEv2-stage1.1 37 | 38 | 39 | torchrun --nproc_per_node=$GPUS_PER_NODE --nnode=$NNODES --node_rank=$1 --master_addr=$2 --master_port=$MASTER_PORT \ 40 | eve/train/train_mem.py \ 41 | --model_name_or_path ${CKPT_PATH} \ 42 | --deepspeed ./scripts/zero3.json \ 43 | --version plain \ 44 | --model_type qwen2 \ 45 | --data_path ${DATA_PATH} \ 46 | --json_path ${JSON_PATH} \ 47 | --image_folder ${IMAGE_PATH} \ 48 | --vision_tower ${VIT_PATH} \ 49 | --add_moe True \ 50 | --moe_part layernorm-self_attn-mlp \ 51 | --tune_VE True \ 52 | --tune_MOE True \ 53 | --bf16 True \ 54 | --output_dir checkpoints/${SAVE_PATH} \ 55 | --num_train_epochs 1 \ 56 | --per_device_train_batch_size 2 \ 57 | --per_device_eval_batch_size 4 \ 58 | --gradient_accumulation_steps 4 \ 59 | --eval_strategy "no" \ 60 | --save_strategy "steps" \ 61 | --save_steps 4000 \ 62 | --save_total_limit 10 \ 63 | --learning_rate ${BASE_LR} \ 64 | --vision_tower_lr ${LEARNIG_RATE} \ 65 | --weight_decay 0. 
\ 66 | --warmup_ratio 0.03 \ 67 | --lr_scheduler_type "cosine" \ 68 | --logging_steps 1 \ 69 | --tf32 True \ 70 | --model_max_length 4096 \ 71 | --gradient_checkpointing True \ 72 | --dataloader_num_workers 4 \ 73 | --lazy_preprocess True \ 74 | --report_to tensorboard \ 75 | --run_name ${SAVE_PATH} \ 76 | 2>&1 | tee logs/${SAVE_PATH}-rank$1.log -------------------------------------------------------------------------------- /EVEv2/scripts/eve/2_eve7b_fullalign_anyratio_hd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_SOCKET_IFNAME=eth0 3 | export NCCL_IB_DISABLE=0 4 | export NCCL_IB_CUDA_SUPPORT=1 5 | export NCCL_IB_GID_INDEX=0 6 | export NCCL_DEBUG=INFO 7 | export NCCL_IB_TIMEOUT=23 8 | export NCCL_IB_RETRY_CNT=7 9 | export NCCL_IB_HCA=mlx5_2,mlx5_5 10 | export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" 11 | 12 | set -x 13 | 14 | wandb offline 15 | apt-get install -y libibverbs1 16 | apt-get install -y libaio-dev 17 | 18 | mkdir -p logs 19 | 20 | export CUDA_DEVICE_MAX_CONNECTIONS=1 21 | export GPUS_PER_NODE=8 22 | export NNODES=16 23 | export MASTER_PORT=12345 24 | export CPUS_PER_TASK=32 25 | export QUOTA=reserved 26 | 27 | export DATA_PATH=playground/data/EVE-v2.0-Pretrain/json_path/infinity-mm-generalqa-index.json 28 | export JSON_PATH=playground/data/EVE-v2.0-Pretrain/json_path 29 | export IMAGE_PATH=playground/data/EVE-v2.0-Pretrain/image_path 30 | export VIT_PATH=openai/eve-anyratio-res1600-patch16 31 | 32 | export BASE_LR=2e-5 33 | export LEARNIG_RATE=2e-5 34 | 35 | export CKPT_PATH=checkpoints/EVEv2-stage1.1 36 | export SAVE_PATH=EVEv2-stage2 37 | 38 | 39 | torchrun --nproc_per_node=$GPUS_PER_NODE --nnode=$NNODES --node_rank=$1 --master_addr=$2 --master_port=$MASTER_PORT \ 40 | eve/train/train_mem.py \ 41 | --model_name_or_path ${CKPT_PATH} \ 42 | --deepspeed ./scripts/zero3.json \ 43 | --version qwen2 \ 44 | --model_type qwen2 \ 45 | --data_path ${DATA_PATH} \ 46 | --json_path ${JSON_PATH} \ 47 | --image_folder ${IMAGE_PATH} \ 48 | --vision_tower ${VIT_PATH} \ 49 | --add_moe True \ 50 | --moe_part layernorm-self_attn-mlp \ 51 | --tune_LLM True \ 52 | --tune_VE True \ 53 | --tune_MOE True \ 54 | --bf16 True \ 55 | --output_dir checkpoints/${SAVE_PATH} \ 56 | --num_train_epochs 1 \ 57 | --per_device_train_batch_size 2 \ 58 | --per_device_eval_batch_size 4 \ 59 | --gradient_accumulation_steps 2 \ 60 | --eval_strategy "no" \ 61 | --save_strategy "steps" \ 62 | --save_steps 4000 \ 63 | --save_total_limit 10 \ 64 | --learning_rate ${BASE_LR} \ 65 | --vision_tower_lr ${LEARNIG_RATE} \ 66 | --weight_decay 0. 
\ 67 | --warmup_ratio 0.03 \ 68 | --lr_scheduler_type "cosine" \ 69 | --logging_steps 1 \ 70 | --tf32 True \ 71 | --model_max_length 4096 \ 72 | --gradient_checkpointing True \ 73 | --dataloader_num_workers 4 \ 74 | --lazy_preprocess True \ 75 | --report_to tensorboard \ 76 | --run_name ${SAVE_PATH} \ 77 | 2>&1 | tee logs/${SAVE_PATH}-rank$1.log -------------------------------------------------------------------------------- /EVEv2/scripts/eve/3_eve7b_finetune_anyratio_hd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_SOCKET_IFNAME=eth0 3 | export NCCL_IB_DISABLE=0 4 | export NCCL_IB_CUDA_SUPPORT=1 5 | export NCCL_IB_GID_INDEX=0 6 | export NCCL_DEBUG=INFO 7 | export NCCL_IB_TIMEOUT=23 8 | export NCCL_IB_RETRY_CNT=7 9 | export NCCL_IB_HCA=mlx5_2,mlx5_5 10 | export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" 11 | 12 | set -x 13 | 14 | wandb offline 15 | apt-get install -y libibverbs1 16 | apt-get install -y libaio-dev 17 | 18 | mkdir -p logs 19 | 20 | export CUDA_DEVICE_MAX_CONNECTIONS=1 21 | export GPUS_PER_NODE=8 22 | export NNODES=16 23 | export MASTER_PORT=12345 24 | export CPUS_PER_TASK=32 25 | export QUOTA=reserved 26 | 27 | export DATA_PATH=playground/data/EVE-v2.0-SFT/infinity-mm-llava-ov-index.json 28 | export JSON_PATH=playground/data/EVE-v2.0-SFT/ 29 | export IMAGE_PATH=playground/data/EVE-v2.0-SFT/image_path 30 | export VIT_PATH=openai/eve-anyratio-res1600-patch16 31 | export PRE_LEN=playground/data/EVE-v2.0-SFT/conversation_presave_lengths.json # option to save loading time 32 | export PRE_MOLEN=playground/data/EVE-v2.0-SFT/conversation_presave_modality_lengths.json # option to save loading time 33 | 34 | export BASE_LR=1e-5 35 | export LEARNIG_RATE=1e-5 36 | 37 | export CKPT_PATH=checkpoints/EVEv2-stage2 38 | export SAVE_PATH=EVE-7B-HD-v2.0 39 | 40 | 41 | torchrun --nproc_per_node=$GPUS_PER_NODE --nnode=$NNODES --node_rank=$1 --master_addr=$2 --master_port=$MASTER_PORT \ 42 | eve/train/train_mem.py \ 43 | --model_name_or_path ${CKPT_PATH} \ 44 | --deepspeed ./scripts/zero3.json \ 45 | --version qwen2 \ 46 | --model_type qwen2 \ 47 | --data_path ${DATA_PATH} \ 48 | --json_path ${JSON_PATH} \ 49 | --image_folder ${IMAGE_PATH} \ 50 | --vision_tower ${VIT_PATH} \ 51 | --group_by_modality_length True \ 52 | --presave_lengths ${PRE_LEN} \ 53 | --presave_modality_lengths ${PRE_MOLEN} \ 54 | --add_moe True \ 55 | --moe_part layernorm-self_attn-mlp \ 56 | --tune_LLM True \ 57 | --tune_VE True \ 58 | --tune_MOE True \ 59 | --bf16 True \ 60 | --output_dir checkpoints/${SAVE_PATH} \ 61 | --num_train_epochs 1 \ 62 | --per_device_train_batch_size 2 \ 63 | --per_device_eval_batch_size 4 \ 64 | --gradient_accumulation_steps 2 \ 65 | --eval_strategy "no" \ 66 | --save_strategy "steps" \ 67 | --save_steps 4000 \ 68 | --save_total_limit 10 \ 69 | --learning_rate ${BASE_LR} \ 70 | --vision_tower_lr ${LEARNIG_RATE} \ 71 | --weight_decay 0. 
\ 72 | --warmup_ratio 0.03 \ 73 | --lr_scheduler_type "cosine" \ 74 | --logging_steps 1 \ 75 | --tf32 True \ 76 | --model_max_length 5000 \ 77 | --gradient_checkpointing True \ 78 | --dataloader_num_workers 4 \ 79 | --lazy_preprocess True \ 80 | --report_to tensorboard \ 81 | --run_name ${SAVE_PATH} \ 82 | 2>&1 | tee logs/${SAVE_PATH}-rank$1.log -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | 5 | CHUNKS=${#GPULIST[@]} 6 | 7 | CKPT_NAME=$1 8 | CKPT_PATH=$2 9 | CONV_MODE=$3 10 | 11 | SPLIT="eve_gqa_testdev_balanced" 12 | GQADIR="./playground/data/eval/gqa/data" 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_loader \ 16 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 17 | --model-type $CONV_MODE \ 18 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 19 | --image-folder ./playground/data/eval/gqa/data/images \ 20 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT_NAME/${CHUNKS}_${IDX}.jsonl \ 21 | --num-chunks $CHUNKS \ 22 | --chunk-idx $IDX \ 23 | --conv-mode $CONV_MODE & 24 | done 25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT_NAME/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 39 | 40 | cd $GQADIR 41 | python eval/eval.py --tier testdev_balanced 42 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | CONV_MODE=$3 5 | 6 | python -m eve.eval.model_vqa \ 7 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 8 | --model-type $CONV_MODE \ 9 | --question-file ./playground/data/eval/llava-bench-in-the-wild/resources/questions.jsonl \ 10 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/resources/images \ 11 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/${CKPT_NAME}.jsonl \ 12 | --conv-mode $CONV_MODE 13 | 14 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 15 | 16 | python eve/eval/eval_gpt_review_bench.py \ 17 | --question playground/data/eval/llava-bench-in-the-wild/resources/questions.jsonl \ 18 | --context playground/data/eval/llava-bench-in-the-wild/resources/context.jsonl \ 19 | --rule eve/eval/table/rule.json \ 20 | --answer-list \ 21 | playground/data/eval/llava-bench-in-the-wild/resources/answers_gpt4.jsonl \ 22 | playground/data/eval/llava-bench-in-the-wild/answers/${CKPT_NAME}.jsonl \ 23 | --output \ 24 | playground/data/eval/llava-bench-in-the-wild/reviews/${CKPT_NAME}-eval1.jsonl 25 | 26 | python eve/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/${CKPT_NAME}-eval1.jsonl 27 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/mmbench_cn.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | CHUNKS=${#GPULIST[@]} 5 | 6 | CKPT_NAME=$1 7 | CKPT_PATH=$2 8 | CONV_MODE=$3 9 | 10 | LANG="cn" 11 | SPLIT="mmbench_dev_cn_20231003" 12 | 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_mmbench \ 16 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 17 | --model-type $CONV_MODE \ 18 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 19 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --lang $LANG \ 23 | --single-pred-prompt \ 24 | --conv-mode $CONV_MODE & 25 | done 26 | 27 | wait 28 | 29 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/merge.jsonl 30 | 31 | # Clear out the output file if it exists. 32 | > "$output_file" 33 | 34 | # Loop through the indices and concatenate each file. 35 | for IDX in $(seq 0 $((CHUNKS-1))); do 36 | cat ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 37 | done 38 | 39 | wait 40 | 41 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 42 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT_NAME} 43 | 44 | python scripts/convert_mmbench_for_submission.py \ 45 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 46 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME} \ 47 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT_NAME} \ 48 | --experiment merge 49 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/mmbench_en.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | CHUNKS=${#GPULIST[@]} 5 | 6 | CKPT_NAME=$1 7 | CKPT_PATH=$2 8 | CONV_MODE=$3 9 | 10 | SPLIT="mmbench_dev_20230712" 11 | LANG="en" 12 | 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_mmbench \ 16 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 17 | --model-type $CONV_MODE \ 18 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 19 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --lang $LANG \ 23 | --single-pred-prompt \ 24 | --conv-mode $CONV_MODE & 25 | done 26 | 27 | wait 28 | 29 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/merge.jsonl 30 | 31 | # Clear out the output file if it exists. 32 | > "$output_file" 33 | 34 | # Loop through the indices and concatenate each file. 
35 | for IDX in $(seq 0 $((CHUNKS-1))); do 36 | cat ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 37 | done 38 | 39 | wait 40 | 41 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 42 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT_NAME} 43 | 44 | python scripts/convert_mmbench_for_submission.py \ 45 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 46 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/${CKPT_NAME} \ 47 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT/${CKPT_NAME} \ 48 | --experiment merge 49 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | CONV_MODE=$3 5 | 6 | python -m eve.eval.model_vqa_loader \ 7 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 8 | --model-type $CONV_MODE \ 9 | --question-file ./playground/data/eval/MME/eve_mme.jsonl \ 10 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 11 | --answers-file ./playground/data/eval/MME/answers/${CKPT_NAME}.jsonl \ 12 | --conv-mode $CONV_MODE 13 | 14 | cd ./playground/data/eval/MME 15 | 16 | python convert_answer_to_mme.py --experiment ${CKPT_NAME} 17 | 18 | cd eval_tool 19 | 20 | python calculation.py --results_dir answers/${CKPT_NAME} 21 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | CONV_MODE=$3 5 | 6 | python -m eve.eval.model_vqa \ 7 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 8 | --model-type $CONV_MODE \ 9 | --question-file ./playground/data/eval/mm-vet/eve-mm-vet.jsonl \ 10 | --image-folder ./playground/data/eval/mm-vet/images \ 11 | --answers-file ./playground/data/eval/mm-vet/answers/${CKPT_NAME}.jsonl \ 12 | --conv-mode $CONV_MODE 13 | 14 | mkdir -p ./playground/data/eval/mm-vet/results 15 | 16 | python scripts/convert_mmvet_for_eval.py \ 17 | --src ./playground/data/eval/mm-vet/answers/${CKPT_NAME}.jsonl \ 18 | --dst ./playground/data/eval/mm-vet/results/${CKPT_NAME}.json 19 | 20 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | CONV_MODE=$3 5 | 6 | python -m eve.eval.model_vqa_loader \ 7 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 8 | --model-type $CONV_MODE \ 9 | --question-file ./playground/data/eval/pope/eve_pope_test.jsonl \ 10 | --image-folder ./playground/data/eval/pope/val2014 \ 11 | --answers-file ./playground/data/eval/pope/answers/${CKPT_NAME}.jsonl \ 12 | --conv-mode $CONV_MODE 13 | 14 | python eve/eval/eval_pope.py \ 15 | --annotation-dir ./playground/data/eval/pope/coco \ 16 | --question-file ./playground/data/eval/pope/eve_pope_test.jsonl \ 17 | --result-file ./playground/data/eval/pope/answers/${CKPT_NAME}.jsonl 18 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/qbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | CONV_MODE=$3 5 | 6 | SPLIT="dev" 7 | 8 | python -m eve.eval.model_vqa_qbench \ 9 | 
--model-path ${CKPT_PATH}/${CKPT_NAME} \ 10 | --model-type $CONV_MODE \ 11 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 12 | --questions-file ./playground/data/eval/qbench/llvisionqa_${SPLIT}.json \ 13 | --answers-file ./playground/data/eval/qbench/llvisionqa_${SPLIT}_${CKPT_NAME}_answers.jsonl \ 14 | --conv-mode $CONV_MODE \ 15 | --lang en 16 | 17 | python playground/data/eval/qbench/format_qbench.py \ 18 | --filepath ./playground/data/eval/qbench/llvisionqa_${SPLIT}_${CKPT_NAME}_answers.jsonl 19 | 20 | python playground/data/eval/qbench/qbench_eval.py \ 21 | --filepath ./playground/data/eval/qbench/llvisionqa_${SPLIT}_${CKPT_NAME}_answers.jsonl -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | 5 | CHUNKS=${#GPULIST[@]} 6 | 7 | CKPT_NAME=$1 8 | CKPT_PATH=$2 9 | CONV_MODE=$3 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_loader \ 13 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 14 | --model-type $CONV_MODE \ 15 | --question-file ./playground/data/eval/seed_bench/eve-seed-bench-image.jsonl \ 16 | --image-folder ./playground/data/eval/seed_bench \ 17 | --answers-file ./playground/data/eval/seed_bench/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --conv-mode $CONV_MODE & 21 | done 22 | 23 | wait 24 | 25 | output_file=./playground/data/eval/seed_bench/answers/${CKPT_NAME}/merge.jsonl 26 | 27 | # Clear out the output file if it exists. 28 | > "$output_file" 29 | 30 | # Loop through the indices and concatenate each file. 31 | for IDX in $(seq 0 $((CHUNKS-1))); do 32 | cat ./playground/data/eval/seed_bench/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 33 | done 34 | 35 | # Evaluate 36 | python scripts/convert_seed_for_submission.py \ 37 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \ 38 | --result-file $output_file \ 39 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/${CKPT_NAME}.jsonl 40 | 41 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | 5 | CHUNKS=${#GPULIST[@]} 6 | 7 | CKPT_NAME=$1 8 | CKPT_PATH=$2 9 | CONV_MODE=$3 10 | 11 | for IDX in $(seq 0 $((CHUNKS-1))); do 12 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_science \ 13 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 14 | --model-type $CONV_MODE \ 15 | --question-file ./playground/data/eval/scienceqa/eve_test_CQM-A.json \ 16 | --image-folder ./playground/data/eval/scienceqa/images/test \ 17 | --answers-file ./playground/data/eval/scienceqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --single-pred-prompt \ 21 | --conv-mode $CONV_MODE & 22 | done 23 | 24 | wait 25 | 26 | output_file=./playground/data/eval/scienceqa/answers/${CKPT_NAME}/merge.jsonl 27 | 28 | # Clear out the output file if it exists. 29 | > "$output_file" 30 | 31 | # Loop through the indices and concatenate each file. 
32 | for IDX in $(seq 0 $((CHUNKS-1))); do 33 | cat ./playground/data/eval/scienceqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl >> "$output_file" 34 | done 35 | 36 | python eve/eval/eval_science_qa.py \ 37 | --base-dir ./playground/data/eval/scienceqa \ 38 | --result-file ./playground/data/eval/scienceqa/answers/${CKPT_NAME}/merge.jsonl \ 39 | --output-file ./playground/data/eval/scienceqa/answers/${CKPT_NAME}/output.jsonl \ 40 | --output-result ./playground/data/eval/scienceqa/answers/${CKPT_NAME}/result.json 41 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | 5 | CHUNKS=${#GPULIST[@]} 6 | CKPT_NAME=$1 7 | CKPT_PATH=$2 8 | CONV_MODE=$3 9 | 10 | for IDX in $(seq 0 $((CHUNKS-1))); do 11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_loader \ 12 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 13 | --model-type $CONV_MODE \ 14 | --question-file ./playground/data/eval/textvqa/eve_textvqa_val_v051_ocr.jsonl \ 15 | --image-folder ./playground/data/eval/textvqa/images \ 16 | --answers-file ./playground/data/eval/textvqa/answers/${CKPT_NAME}/${CHUNKS}_${IDX}.jsonl \ 17 | --num-chunks $CHUNKS \ 18 | --chunk-idx $IDX \ 19 | --conv-mode $CONV_MODE & 20 | done 21 | 22 | wait 23 | 24 | output_file=./playground/data/eval/textvqa/answers/$CKPT_NAME/merge.jsonl 25 | 26 | # Clear out the output file if it exists. 27 | > "$output_file" 28 | 29 | # Loop through the indices and concatenate each file. 30 | for IDX in $(seq 0 $((CHUNKS-1))); do 31 | cat ./playground/data/eval/textvqa/answers/$CKPT_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" 32 | done 33 | 34 | python -m eve.eval.eval_textvqa \ 35 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 36 | --result-file ./playground/data/eval/textvqa/answers/${CKPT_NAME}/merge.jsonl 37 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CKPT_NAME=$1 3 | CKPT_PATH=$2 4 | CONV_MODE=$3 5 | 6 | python -m eve.eval.model_vqa_loader \ 7 | --model-path ${CKPT_PATH}/${CKPT_NAME} \ 8 | --model-type $CONV_MODE \ 9 | --question-file ./playground/data/eval/vizwiz/eve_test.jsonl \ 10 | --image-folder ./playground/data/eval/vizwiz/test \ 11 | --answers-file ./playground/data/eval/vizwiz/answers/${CKPT_NAME}.jsonl \ 12 | --conv-mode $CONV_MODE 13 | 14 | python scripts/convert_vizwiz_for_submission.py \ 15 | --annotation-file ./playground/data/eval/vizwiz/eve_test.jsonl \ 16 | --result-file ./playground/data/eval/vizwiz/answers/${CKPT_NAME}.jsonl \ 17 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/${CKPT_NAME}.json 18 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 3 | IFS=',' read -ra GPULIST <<< "$gpu_list" 4 | 5 | CHUNKS=${#GPULIST[@]} 6 | 7 | CKPT_NAME=$1 8 | CKPT_PATH=$2 9 | CONV_MODE=$3 10 | 11 | SPLIT='eve_vqav2_mscoco_test-dev2015' 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m eve.eval.model_vqa_loader \ 15 | --model-path 
${CKPT_PATH}/${CKPT_NAME} \ 16 | --model-type $CONV_MODE \ 17 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 18 | --image-folder ./playground/data/eval/vqav2/test2015 \ 19 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT_NAME/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --conv-mode $CONV_MODE & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT_NAME/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT_NAME/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT_NAME 38 | 39 | -------------------------------------------------------------------------------- /EVEv2/scripts/eve/test_all_benchmark.sh: -------------------------------------------------------------------------------- 1 | CKPT_NAME='EVE-7B-HD-v2.0' 2 | CKPT_PATH='checkpoints' 3 | CONV_MODE='qwen2' 4 | 5 | mkdir -p log_results 6 | 7 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/seed.sh ${CKPT_NAME} ${CKPT_PATH} ${CONV_MODE} 2>&1 | tee log_results/${CKPT_NAME}_seed 8 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/mmbench_en.sh ${CKPT_NAME} ${CKPT_PATH} ${CONV_MODE} 2>&1 | tee log_results/${CKPT_NAME}_mmbench_en 9 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/gqa.sh ${CKPT_NAME} ${CKPT_PATH} ${CONV_MODE} 2>&1 | tee log_results/${CKPT_NAME}_gqa 10 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/sqa.sh ${CKPT_NAME} ${CKPT_PATH} ${CONV_MODE} 2>&1 | tee log_results/${CKPT_NAME}_sqa 11 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/textvqa.sh ${CKPT_NAME} ${CKPT_PATH} ${CONV_MODE} 2>&1 | tee log_results/${CKPT_NAME}_textvqa 12 | CUDA_VISIBLE_DEVICES=1 bash scripts/eve/eval/mmvet.sh ${CKPT_NAME} ${CKPT_PATH} ${CONV_MODE} 2>&1 | tee log_results/${CKPT_NAME}_mmvet 13 | CUDA_VISIBLE_DEVICES=0 bash scripts/eve/eval/mme.sh ${CKPT_NAME} ${CKPT_PATH} ${CONV_MODE} 2>&1 | tee log_results/${CKPT_NAME}_mme 14 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/eve/eval/vqav2.sh ${CKPT_NAME} ${CKPT_PATH} ${CONV_MODE} 2>&1 | tee log_results/${CKPT_NAME}_vqav2 15 | CUDA_VISIBLE_DEVICES=2 bash scripts/eve/eval/pope.sh ${CKPT_NAME} ${CKPT_PATH} ${CONV_MODE} 2>&1 | tee log_results/${CKPT_NAME}_pope 16 | CUDA_VISIBLE_DEVICES=3 bash scripts/eve/eval/vizwiz.sh ${CKPT_NAME} ${CKPT_PATH} ${CONV_MODE} 2>&1 | tee log_results/${CKPT_NAME}_vizwiz 17 | -------------------------------------------------------------------------------- /EVEv2/scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /EVEv2/scripts/zero3.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /EVEv2/scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 BAAI-Vision 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EVE Series: Encoder-Free VLMs from BAAI 2 | 3 |

4 | 5 |

6 | 7 | - **2024/05**: [EVEv1](https://github.com/baaivision/EVE/blob/main/EVEv1/README.md) - Unveiling Encoder-Free Vision-Language Models (NeurIPS 2024, Spotlight) 8 | 9 | - **2024/11**: [EVEv2](https://github.com/baaivision/EVE/blob/main/EVEv2/README.md) - EVEv2: Improved Baselines for Encoder-Free Vision-Language Models (arXiv 2025) 10 | 11 | 12 | ## 💡 Motivation 13 | 14 | - **Can we remove the vision encoder from VLMs?** 15 | 16 | - **How can we transfer an LLM into an encoder-free VLM efficiently and stably?** 17 | 18 | - **How can we bridge the performance gap between encoder-free and encoder-based VLMs?** 19 | 20 | ## 📜 News 21 | [2025/02/09] 🔥🔥🔥 The [paper](https://arxiv.org/abs/2502.06788), [weights](https://huggingface.co/BAAI/EVE-7B-HD-v2.0), and [code](https://github.com/baaivision/EVE/blob/main/EVEv2/README.md) of **EVEv2** are released! 💥 22 | [2024/09/26] Our **EVE** has been accepted to **NeurIPS 2024** (**spotlight**)! 💥 23 | [2024/06/18] The [paper](https://arxiv.org/abs/2406.11832), [weights](https://huggingface.co/BAAI/EVE-7B-HD-v1.0), and [code](https://github.com/baaivision/EVE/blob/main/EVEv1/README.md) of **EVE** are released! 💥 24 | 25 | ## 💡 Highlights 26 | - 🔥 **Superior Capability:** *An encoder-free LVLM built from scratch* that handles *arbitrary* image aspect ratios, outperforming its counterparts and approaching existing *modular encoder-based* LVLMs. 27 | 28 | - 🔥 **Data Efficiency:** We filter and recaption only *<100M* publicly available samples from OpenImages, SAM, LAION, and Datacomp for pre-training. 29 | 30 | - 🔥 **Pioneering Route:** We attempt to provide an *efficient*, *transparent*, and *practical* training strategy and procedure for developing a pure decoder-only architecture across modalities. 31 | 32 | 33 | 34 | ## ✒️ Citation 35 | If **EVE** is helpful for your research, please consider giving it a **star** ⭐ and a **citation** 📝: 36 | ```bibtex 37 | @article{diao2024EVE, 38 | title={Unveiling Encoder-Free Vision-Language Models}, 39 | author={Diao, Haiwen and Cui, Yufeng and Li, Xiaotong and Wang, Yueze and Lu, Huchuan and Wang, Xinlong}, 40 | journal={arXiv preprint arXiv:2406.11832}, 41 | year={2024} 42 | } 43 | ``` 44 | 45 | ```bibtex 46 | @article{diao2025EVEv2, 47 | title={EVEv2: Improved Baselines for Encoder-Free Vision-Language Models}, 48 | author={Diao, Haiwen and Li, Xiaotong and Cui, Yufeng and Wang, Yueze and Deng, Haoge and Pan, Ting and Wang, Wenxuan and Lu, Huchuan and Wang, Xinlong}, 49 | journal={arXiv preprint arXiv:2502.06788}, 50 | year={2025} 51 | } 52 | ``` 53 | 54 | ## 📄 License 55 | The content of this project is licensed under [LICENSE](https://github.com/baaivision/EVE/blob/main/LICENSE). 56 | --------------------------------------------------------------------------------
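The evaluation scripts under `EVEv2/scripts/eve/eval/` all follow the same fan-out pattern: one inference process per visible GPU writes an answer shard named `<CHUNKS>_<IDX>.jsonl`, and the shards are then concatenated into `merge.jsonl` before the corresponding `convert_*_for_submission.py` script runs. Below is a minimal Python sketch of that merge step, illustrative only; the helper name and the example path are assumptions, not part of the repository.

```python
import os


def merge_chunk_answers(answer_dir: str, chunks: int) -> str:
    """Concatenate per-GPU shards (<chunks>_<idx>.jsonl) into merge.jsonl,
    mirroring the `cat ... >> "$output_file"` loop in the eval scripts."""
    merged_path = os.path.join(answer_dir, "merge.jsonl")
    with open(merged_path, "w") as merged:
        for idx in range(chunks):
            shard_path = os.path.join(answer_dir, f"{chunks}_{idx}.jsonl")
            with open(shard_path) as shard:
                merged.write(shard.read())
    return merged_path


# Hypothetical usage, following the directory layout of the GQA script above:
# merge_chunk_answers(
#     "playground/data/eval/gqa/answers/eve_gqa_testdev_balanced/EVE-7B-HD-v2.0",
#     chunks=8,
# )
```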