├── mixeval_x ├── utils │ ├── __init__.py │ ├── count_token.py │ ├── gemini_api_script.py │ ├── common_utils.py │ ├── judge_freeform_parser.py │ ├── judge_multichoice_parser.py │ ├── judge_text2action.py │ ├── judge_image2action.py │ ├── metric_utils.py │ └── judge_text2image.py ├── examples │ └── model_response_dir │ │ ├── audio2text │ │ └── gemini_1.5_pro │ │ │ └── gemini_1.5_pro_ff.jsonl │ │ ├── audio2text_hard │ │ └── gemini_1.5_pro │ │ │ └── gemini_1.5_pro_ff.jsonl │ │ ├── image2text │ │ └── gemini_1.5 │ │ │ ├── gemini_1.5_ff.jsonl │ │ │ └── gemini_1.5_mp.jsonl │ │ ├── image2text_hard │ │ └── gemini_1.5 │ │ │ ├── gemini_1.5_ff.jsonl │ │ │ └── gemini_1.5_mp.jsonl │ │ ├── video2text │ │ └── gemini_1_5_pro │ │ │ ├── gemini_1_5_pro_ff.jsonl │ │ │ └── gemini_1_5_pro_mp.jsonl │ │ ├── video2text_hard │ │ └── gemini_1_5_pro │ │ │ ├── gemini_1_5_pro_ff.jsonl │ │ │ └── gemini_1_5_pro_mp.jsonl │ │ ├── text2image │ │ └── dall-e-3 │ │ │ └── dall-e-3_t2i.jsonl │ │ ├── image2action │ │ └── gemini_1.5 │ │ │ └── gemini_1.5_i2a.jsonl │ │ └── text2action │ │ └── gemini_15_pro │ │ └── gemini_15_pro_t2a.jsonl ├── prompts │ ├── evaluation_prompts.py │ ├── evaluation_prompts_image2text.py │ └── judge_prompts.py ├── compute_metrics_mmg_agent.py └── compute_metrics_mmu.py ├── setup.sh ├── resources └── imgs │ ├── header.png │ └── mixeval-x.jpg ├── setup.py ├── .gitignore ├── docs └── model_output_formats.md └── README.md /mixeval_x/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install -e . -------------------------------------------------------------------------------- /resources/imgs/header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JinjieNi/MixEval-X/HEAD/resources/imgs/header.png -------------------------------------------------------------------------------- /resources/imgs/mixeval-x.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JinjieNi/MixEval-X/HEAD/resources/imgs/mixeval-x.jpg -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/audio2text/gemini_1.5_pro/gemini_1.5_pro_ff.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_type": "free-form", "audio_id": "1.wav", "prompt": "Is the frog making noise?", "target": ["yes"], "benchmark_name": "Clotho-AQA", "response": "Yes.", "id": "1"} 2 | {"problem_type": "free-form", "audio_id": "2.wav", "prompt": "Is someone blowing a whistle?", "target": ["no"], "benchmark_name": "Clotho-AQA", "response": "Yes", "id": "2"} 3 | {"problem_type": "free-form", "audio_id": "3.wav", "prompt": "What did you listen to after listening to the door closing?", "target": ["crowd babbling"], "benchmark_name": "DAQA", "response": "Silence", "id": "3"} -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/audio2text_hard/gemini_1.5_pro/gemini_1.5_pro_ff.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_type": "free-form", "audio_id": "1.wav", "prompt": "Is the frog making noise?", "target": ["yes"], "benchmark_name": "Clotho-AQA", "response": "Yes.", 
"id": "1"} 2 | {"problem_type": "free-form", "audio_id": "2.wav", "prompt": "Is someone blowing a whistle?", "target": ["no"], "benchmark_name": "Clotho-AQA", "response": "Yes", "id": "2"} 3 | {"problem_type": "free-form", "audio_id": "3.wav", "prompt": "What did you listen to after listening to the door closing?", "target": ["crowd babbling"], "benchmark_name": "DAQA", "response": "Silence", "id": "3"} -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/image2text/gemini_1.5/gemini_1.5_ff.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_type": "free-form", "image_id": "1.jpg", "prompt": "Where are the people that are standing looking at?", "target": ["away"], "benchmark_name": "GQA", "response": "The kite", "id": "1"} 2 | {"problem_type": "free-form", "image_id": "2.jpg", "prompt": "is this an online museum?", "target": ["yes", "yes", "yes", "no", "yes", "yes", "yes", "the louvre", "yes", "yes"], "benchmark_name": "TextVQA", "response": "Not likely.", "id": "2"} 3 | {"problem_type": "free-form", "image_id": "3.jpg", "prompt": "On which side of the photo is the silk kite?", "target": ["right"], "benchmark_name": "GQA", "response": "The left side", "id": "3"} -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/image2text_hard/gemini_1.5/gemini_1.5_ff.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_type": "free-form", "image_id": "1.jpg", "prompt": "Where are the people that are standing looking at?", "target": ["away"], "benchmark_name": "GQA", "response": "The kite", "id": "1"} 2 | {"problem_type": "free-form", "image_id": "2.jpg", "prompt": "is this an online museum?", "target": ["yes", "yes", "yes", "no", "yes", "yes", "yes", "the louvre", "yes", "yes"], "benchmark_name": "TextVQA", "response": "Not likely.", "id": "2"} 3 | {"problem_type": "free-form", "image_id": "3.jpg", "prompt": "On which side of the photo is the silk kite?", "target": ["right"], "benchmark_name": "GQA", "response": "The left side", "id": "3"} -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/video2text/gemini_1_5_pro/gemini_1_5_pro_ff.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_type": "free-form", "video_id": "1.mp4", "prompt": "is playing football difficult", "target": ["yes"], "benchmark_name": "ActivityNet-QA", "response": "Yes.", "id": "1"} 2 | {"problem_type": "free-form", "video_id": "2.mp4", "prompt": "What is the idea of the advertisement?", "target": ["steam clean", "steam clean", "cleaning", "car", "steam clean"], "benchmark_name": "IVQA", "response": "Car steam cleaning. 
\n", "id": "2"} 3 | {"problem_type": "free-form", "video_id": "3.mp4", "prompt": "What does the location of video look like?", "target": ["labotory", "laboratory", "laboratory", "laboratory", "laboratory"], "benchmark_name": "IVQA", "response": "Laboratory", "id": "3"} -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/video2text_hard/gemini_1_5_pro/gemini_1_5_pro_ff.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_type": "free-form", "video_id": "1.mp4", "prompt": "is playing football difficult", "target": ["yes"], "benchmark_name": "ActivityNet-QA", "response": "Yes.", "id": "1"} 2 | {"problem_type": "free-form", "video_id": "2.mp4", "prompt": "What is the idea of the advertisement?", "target": ["steam clean", "steam clean", "cleaning", "car", "steam clean"], "benchmark_name": "IVQA", "response": "Car steam cleaning. \n", "id": "2"} 3 | {"problem_type": "free-form", "video_id": "3.mp4", "prompt": "What does the location of video look like?", "target": ["labotory", "laboratory", "laboratory", "laboratory", "laboratory"], "benchmark_name": "IVQA", "response": "Laboratory", "id": "3"} -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/image2text/gemini_1.5/gemini_1.5_mp.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_type": "single-choice", "image_id": "1.jpg", "prompt": "Is the man's face clearly visible in the image?", "options": ["Yes", "No"], "target": [1], "benchmark_name": "Q-Bench", "response": "B."} 2 | {"problem_type": "single-choice", "image_id": "2.jpg", "prompt": "Are person1 and person2 the same person ?", "options": ["Yes they are related .", "person1 and person2 are not on the same side .", "Yes , they are students in the same class at school .", "Yes , person1 and person2 are the same ."], "target": [3], "benchmark_name": "SEED2", "response": "D."} 3 | {"problem_type": "single-choice", "image_id": "3.jpg", "prompt": "Where are the cast of the television show located in the image?", "options": ["In the foreground", "In the background", "In the center", "At the edges"], "target": [2], "benchmark_name": "SEED", "response": "A"} -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/image2text_hard/gemini_1.5/gemini_1.5_mp.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_type": "single-choice", "image_id": "1.jpg", "prompt": "Is the man's face clearly visible in the image?", "options": ["Yes", "No"], "target": [1], "benchmark_name": "Q-Bench", "response": "B."} 2 | {"problem_type": "single-choice", "image_id": "2.jpg", "prompt": "Are person1 and person2 the same person ?", "options": ["Yes they are related .", "person1 and person2 are not on the same side .", "Yes , they are students in the same class at school .", "Yes , person1 and person2 are the same ."], "target": [3], "benchmark_name": "SEED2", "response": "D."} 3 | {"problem_type": "single-choice", "image_id": "3.jpg", "prompt": "Where are the cast of the television show located in the image?", "options": ["In the foreground", "In the background", "In the center", "At the edges"], "target": [2], "benchmark_name": "SEED", "response": "A"} -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='mixeval_x', 5 | version='1.0.0', 6 | author='MixEval-X team', 7 | author_email='jinjieni@nus.edu.sg', 8 | packages=find_packages(), 9 | install_requires=[ 10 | 'tiktoken==0.6.0', 11 | 'pandas==2.2.2', 12 | 'openai==1.30.5', 13 | 'httpx==0.27.0', 14 | 'nltk==3.8.1', 15 | 'numpy==1.26.3', 16 | 'tqdm==4.66.4', 17 | 'protobuf==4.25.3', 18 | 'python-dotenv==1.0.1', 19 | 'anthropic==0.28.0', 20 | 'google-generativeai==0.5.4', 21 | 'google-cloud-aiplatform==1.53.0', 22 | 'Pillow' 23 | ], 24 | package_data={ 25 | }, 26 | entry_points={ 27 | }, 28 | url='https://mixeval-x.github.io/', 29 | license='License :: OSI Approved :: MIT License', 30 | description='A real-world any-to-any benchmark and eval suite for models with diverse input and output modalities.', 31 | long_description=open('README.md').read(), 32 | long_description_content_type='text/markdown', 33 | ) 34 | -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/video2text/gemini_1_5_pro/gemini_1_5_pro_mp.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_type": "single-choice", "video_id": "1.mp4", "prompt": "How did Eddie know Pam and Justin before Justin was killed?", "options": ["They were part of the theater company", "They were high school friends", "They went to college together", "They were cousins", "They were siblings"], "target": [0], "benchmark_name": "TVQA", "response": "A\n", "id": "1"} 2 | {"problem_type": "single-choice", "video_id": "2.mp4", "prompt": "Who did Ryan say died in a single vehicle car crash after he reads the state trooper's report?", "options": [" The mother of the victim, Anya Mishkin.", "The victim, Anya Mishkin.", "The witness of the car accident.", "The best friend of the victim, Anya Mishkin.", "The state trooper's officer partner."], "target": [0], "benchmark_name": "TVQA", "response": "A. The mother of", "id": "2"} 3 | {"problem_type": "single-choice", "video_id": "3.mp4", "prompt": "What are the people doing when they gather around the conference table? ", "options": ["They pound on the table and clap.", "They eat cake. ", "They talk on cell phones.", "They type on lap tops. ", "They sit quietly."], "target": [0], "benchmark_name": "TVQA", "response": "A. They pound on", "id": "3"} -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/video2text_hard/gemini_1_5_pro/gemini_1_5_pro_mp.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_type": "single-choice", "video_id": "1.mp4", "prompt": "How did Eddie know Pam and Justin before Justin was killed?", "options": ["They were part of the theater company", "They were high school friends", "They went to college together", "They were cousins", "They were siblings"], "target": [0], "benchmark_name": "TVQA", "response": "A\n", "id": "1"} 2 | {"problem_type": "single-choice", "video_id": "2.mp4", "prompt": "Who did Ryan say died in a single vehicle car crash after he reads the state trooper's report?", "options": [" The mother of the victim, Anya Mishkin.", "The victim, Anya Mishkin.", "The witness of the car accident.", "The best friend of the victim, Anya Mishkin.", "The state trooper's officer partner."], "target": [0], "benchmark_name": "TVQA", "response": "A. 
The mother of", "id": "2"} 3 | {"problem_type": "single-choice", "video_id": "3.mp4", "prompt": "What are the people doing when they gather around the conference table? ", "options": ["They pound on the table and clap.", "They eat cake. ", "They talk on cell phones.", "They type on lap tops. ", "They sit quietly."], "target": [0], "benchmark_name": "TVQA", "response": "A. They pound on", "id": "3"} -------------------------------------------------------------------------------- /mixeval_x/prompts/evaluation_prompts.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Besides the prompt, the input to the model also requires the an input field. 4 | # for text2text, the input field is the text specified in 'context'; 5 | # for image2text, the input field is the image specified in 'image_id'; 6 | # for audio2text, the input field is the audio specified in 'audio_id'; 7 | # for video2text, the input field is the video specified in 'video_id'. 8 | 9 | 10 | MULTI_CHOICE_PROMPT = "Answer with the option letter from the given choices directly." 11 | FREE_FORM_PROMPT = "Answer the question using a single word or phrase." 12 | 13 | def parse_options(options): 14 | option_letters = [chr(ord("A") + i) for i in range(len(options))] 15 | choices_str = "\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)]) 16 | return choices_str 17 | 18 | def construct_prompt_multichoice(entry): 19 | prompt = entry["prompt"] 20 | parsed_options = parse_options(entry["options"]) 21 | prompt = f"{prompt}\n{parsed_options}\n{MULTI_CHOICE_PROMPT}" 22 | return prompt 23 | 24 | def construct_prompt_freeform(entry): 25 | prompt = entry["prompt"] 26 | prompt = f"{prompt}\n{FREE_FORM_PROMPT}" 27 | return prompt 28 | 29 | 30 | if __name__ == "__main__": 31 | 32 | task_file_path = "data/tasks.json" 33 | with open(task_file_path, "r") as f: 34 | data = json.load(f) 35 | for id, entry in data.items(): 36 | if "options" in entry: 37 | prompt = construct_prompt_multichoice(entry) 38 | else: 39 | prompt = construct_prompt_freeform(entry) 40 | print(prompt) 41 | -------------------------------------------------------------------------------- /mixeval_x/prompts/evaluation_prompts_image2text.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Besides the prompt, the input to the model also requires the an input field. 4 | # for text2text, the input field is the text specified in 'context'; 5 | # for image2text, the input field is the image specified in 'image_id'; 6 | # for audio2text, the input field is the audio specified in 'audio_id'; 7 | # for video2text, the input field is the video specified in 'video_id'. 8 | 9 | 10 | MULTI_CHOICE_PROMPT = "Answer with the option letter from the given choices directly." 11 | FREE_FORM_PROMPT = "Answer the question using a single word or phrase." 12 | FREE_FORM_PROMPT_MMVET = "Answer the question using a single word, phrase, or sentence." 13 | 14 | def parse_options(options): 15 | option_letters = [chr(ord("A") + i) for i in range(len(options))] 16 | choices_str = "\n".join([f"{option_letter}. 
{option}" for option_letter, option in zip(option_letters, options)]) 17 | return choices_str 18 | 19 | def construct_prompt_multichoice(entry): 20 | prompt = entry["prompt"] 21 | parsed_options = parse_options(entry["options"]) 22 | prompt = f"{prompt}\n{parsed_options}\n{MULTI_CHOICE_PROMPT}" 23 | return prompt 24 | 25 | def construct_prompt_freeform(entry): 26 | prompt = entry["prompt"] 27 | if entry["benchmark_name"] == "MM-Vet": 28 | prompt = f"{prompt}\n{FREE_FORM_PROMPT_MMVET}" 29 | else: 30 | prompt = f"{prompt}\n{FREE_FORM_PROMPT}" 31 | return prompt 32 | 33 | 34 | if __name__ == "__main__": 35 | 36 | task_file_path = "data/tasks.json" 37 | with open(task_file_path, "r") as f: 38 | data = json.load(f) 39 | for id, entry in data.items(): 40 | if "options" in entry: 41 | prompt = construct_prompt_multichoice(entry) 42 | else: 43 | prompt = construct_prompt_freeform(entry) 44 | print(prompt) 45 | -------------------------------------------------------------------------------- /mixeval_x/utils/count_token.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | 5 | import tiktoken 6 | 7 | from mix_eval.prompts.evaluation_prompts import ( 8 | construct_prompt_multichoice, 9 | construct_prompt_freeform, 10 | ) 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "--split", 16 | type=str, 17 | choices=["close_freeform", "close_multichoice", "open", "all"], 18 | default="all", 19 | help="Split to evaluate." 20 | ) 21 | return parser.parse_args() 22 | 23 | def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"): 24 | """Returns the number of tokens used by a list of messages.""" 25 | try: 26 | encoding = tiktoken.encoding_for_model(model) 27 | except KeyError: 28 | encoding = tiktoken.get_encoding("cl100k_base") 29 | if model == "gpt-3.5-turbo-0613": # note: future models may deviate from this 30 | num_tokens = 0 31 | for message in messages: 32 | num_tokens += 4 # every message follows {role/name}\n{content}\n 33 | for key, value in message.items(): 34 | num_tokens += len(encoding.encode(value)) 35 | if key == "name": # if there's a name, the role is omitted 36 | num_tokens += -1 # role is always required and always 1 token 37 | num_tokens += 2 # every reply is primed with assistant 38 | return num_tokens 39 | else: 40 | raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}. 
41 | See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""") 42 | 43 | def count_all_tokens_to_filter(args): 44 | number_tokens = 0 45 | 46 | if args.split == "all": 47 | splits = ["close_freeform", "close_multichoice", "open"] 48 | else: 49 | splits = [args.split] 50 | 51 | for split in splits: 52 | if split == "close_freeform": 53 | data_path = "mix_eval/data/text2text/text2text_closeended/free-form.json" 54 | elif split == "close_multichoice": 55 | data_path = "mix_eval/data/text2text/text2text_closeended/multiple-choice.json" 56 | elif split == "open": 57 | data_path = "mix_eval/data/text2text/text2text_openended.json" 58 | with open(data_path, "r") as f: 59 | data = json.load(f) 60 | for id, d in data.items(): 61 | if split == "close_multichoice": 62 | formated_input = construct_prompt_multichoice(d) 63 | number_tokens += num_tokens_from_messages([{"content": formated_input}]) 64 | elif split == "close_freeform": 65 | formated_input = construct_prompt_freeform(d) 66 | number_tokens += num_tokens_from_messages([{"content": formated_input}]) 67 | else: 68 | formated_input = '\n'.join(d["turns"]) 69 | number_tokens += num_tokens_from_messages([{"content": formated_input}]) + 1500 70 | 71 | print(f"Total number of tokens: {number_tokens}") 72 | 73 | 74 | 75 | if __name__ == '__main__': 76 | args = parse_args() 77 | count_all_tokens_to_filter(args) 78 | -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/text2image/dall-e-3/dall-e-3_t2i.jsonl: -------------------------------------------------------------------------------- 1 | {"first_turn_user_prompt": "Design a vibrant cityscape banner featuring the iconic Sydney Opera House and Harbour Bridge under a dazzling sunset, with the City of Sydney\u2019s logo prominently displayed in the foreground, ensuring it catches the eye against the vivid backdrop.", "first_turn_caption": "A vibrant cityscape banner featuring the iconic Sydney Opera House and Harbour Bridge under a dazzling sunset, with the City of Sydney\u2019s logo prominently displayed in the foreground, ensuring it catches the eye against the vivid backdrop.", "second_turn_user_prompt": " Remove the City of Sydney\u2019s logo.", "second_turn_caption": "A vibrant cityscape banner featuring the iconic Sydney Opera House and Harbour Bridge under a dazzling sunset, ensuring it catches the eye against the vivid backdrop.", "gen_1st_turn": "/home/nus-njj/projects/InstructioninWild/cache/MixEval-X/text2image/model_outputs/dall-e-3/1/1.jpg", "gen_2nd_turn": "/home/nus-njj/projects/InstructioninWild/cache/MixEval-X/text2image/model_outputs/dall-e-3/1/2.jpg", "id": "1"} 2 | {"first_turn_user_prompt": "Illustrate an informative and detailed chart showcasing various honor societies along with their corresponding cord colors, each clearly labeled for easy identification. 
Please include a diverse range of societies to reflect a broad spectrum of academic achievements.", "first_turn_caption": "An informative and detailed chart showcasing various honor societies along with their corresponding cord colors, each clearly labeled for easy identification, reflecting a broad spectrum of academic achievements.", "second_turn_user_prompt": " Change the chart to pie chart format.", "second_turn_caption": "An informative and detailed pie chart showcasing various honor societies along with their corresponding cord colors, each clearly labeled for easy identification, reflecting a broad spectrum of academic achievements.", "gen_1st_turn": "/home/nus-njj/projects/InstructioninWild/cache/MixEval-X/text2image/model_outputs/dall-e-3/2/1.jpg", "gen_2nd_turn": "/home/nus-njj/projects/InstructioninWild/cache/MixEval-X/text2image/model_outputs/dall-e-3/2/2.jpg", "id": "2"} 3 | {"first_turn_user_prompt": "Depict a bustling cityscape transitioning from rain to sunshine, where the detailed weather parameters such as temperature gauges and wind speed indicators are artistically incorporated into the scene, capturing the dynamic change in weather conditions throughout the day.", "first_turn_caption": "A bustling cityscape transitioning from rain to sunshine, where detailed weather parameters such as temperature gauges and wind speed indicators are artistically incorporated into the scene, capturing the dynamic change in weather conditions throughout the day.", "second_turn_user_prompt": " Change the weather from rain to snow.", "second_turn_caption": "A bustling cityscape transitioning from rain to snow, where detailed weather parameters such as temperature gauges and wind speed indicators are artistically incorporated into the scene, capturing the dynamic change in weather conditions throughout the day.", "gen_1st_turn": "/home/nus-njj/projects/InstructioninWild/cache/MixEval-X/text2image/model_outputs/dall-e-3/3/1.jpg", "gen_2nd_turn": "/home/nus-njj/projects/InstructioninWild/cache/MixEval-X/text2image/model_outputs/dall-e-3/3/2.jpg", "id": "3"} -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/image2action/gemini_1.5/gemini_1.5_i2a.jsonl: -------------------------------------------------------------------------------- 1 | {"image_id": "7.jpg", "task description": "Paint an object with blue color to resemble a blueberry.", "allowed actions": "[PickupObject], [PutObject], [Navigation], [OpenContainer], [CloseContainer], [SelectPaintTool], [MixColor], [ApplyPaint], [CleanBrush], [InspectObject], [DryObject]", "already executed steps": "[Navigation] , [SelectPaintTool] , [MixColor] , [ApplyPaint] ", "target": "[InspectObject] , [DryObject] ", "response": "My Planning: \n[InspectObject] , \n[ApplyPaint] , \n[DryObject] ,\n[CleanBrush] \n", "id": "1"} 2 | {"image_id": "18.jpg", "task description": "Analyze the displayed X-ray image by identifying and logging the visible features using the provided software.", "allowed actions": "[Navigate], [Click], [ReadText], [InputText], [OpenMenu], [SelectOption], [LoadImage], [AnalyzeImage], [LogFeature], [SaveData], [ToggleSwitch], [CloseApplication]", "already executed steps": "[Navigate] , [ReadText] , [OpenMenu] , [SelectOption] , [LoadImage] , [AnalyzeImage] ", "target": "[LogFeature] , [SaveData] , [CloseApplication] ", "response": "[Navigate] , [Click] , [LogFeature] , [Click] , [LogFeature] , [Click] , [LogFeature] , ... 
[Navigate] , [OpenMenu] , [SelectOption] , [CloseApplication] \n\n**Explanation:**\n\n1. **Feature Identification and Logging:** The planning assumes the software highlights identifiable features after analysis. I will navigate to each feature, click on it, and accurately describe it using the [LogFeature] action. This process will be repeated for all visible features.\n2. **Saving and Exiting:** Finally, I'll save the logged data and close the application.\n\n**Important Notes:**\n\n* **Invisible Features:** The prompt mentions potential invisible usable objects. In this context, there might be tools within the software (e.g., zoom, contrast adjustment) that could reveal additional features. If such tools are standard in X-ray analysis software, it would be logical to incorporate their use in the planning.\n* **Feature Specificity:** The exact features and descriptions would depend on the specific X-ray image and the analysis software's capabilities. \n* **Real-World Adaptation:** As a real-world agent, I would adapt my plan based on the software's specific interface and features. I would also seek clarification or assistance if I encounter any unforeseen issues. \n", "id": "2"} 3 | {"image_id": "5.jpg", "task description": "Prepare the AED for use by attaching the electrode pads onto the patient's chest and connect the pads to the AED.", "allowed actions": "[Navigate], [UnpackObject], [IdentifyObject], [PickupObject], [PlaceObject], [ConnectObject], [ActivateObject], [MonitorVitalSigns], [Listen], [Speak], [ReadText]", "already executed steps": "[Navigate] , [IdentifyObject] , [UnpackObject] , [PickupObject] , [PlaceObject] ", "target": "[ConnectObject] , [ActivateObject] , [MonitorVitalSigns], [ReadText] , [Listen]", "response": "Your Planning: \n[ConnectObject] , [ActivateObject] , [MonitorVitalSigns] , [Listen] \n", "id": "3"} -------------------------------------------------------------------------------- /mixeval_x/utils/gemini_api_script.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import time 3 | import random 4 | import os 5 | from dotenv import load_dotenv 6 | 7 | from concurrent.futures import ThreadPoolExecutor 8 | 9 | import google.generativeai as genai 10 | 11 | 12 | ########################Gemini######################## 13 | class GeminiJudgeText2Image: 14 | def __init__(self, args): 15 | self.args = args 16 | self.JUDGE = "gemini-1.5-pro-latest" # "gemini-1.5-flash-latest" 17 | self.FIX_INTERVAL_SECOND = 0 18 | self.MAX_RETRY_NUM = 999999 19 | self.MAX_NEW_TOKENS = 999 20 | 21 | self.FORMAT_MAXRETRY = 10 22 | 23 | load_dotenv() 24 | genai.configure(api_key=os.getenv('k_g')) # set the api key here 25 | self.model = genai.GenerativeModel(self.JUDGE) 26 | 27 | self.safety_settings={ 28 | 'harm_category_harassment':'block_none', 29 | 'harm_category_hate_speech': 'block_none', 30 | 'harm_category_sexually_explicit': 'block_none', 31 | 'harm_category_dangerous_content': 'block_none' 32 | } 33 | 34 | def format_prompts(self, inputs): 35 | 36 | # input format logic here 37 | 38 | return formated 39 | 40 | def _GPT_decode(self, inputs): 41 | completion = self.model.generate_content( 42 | self.format_prompts(inputs), 43 | generation_config=genai.types.GenerationConfig( 44 | candidate_count=1, 45 | max_output_tokens=self.MAX_NEW_TOKENS, 46 | ), 47 | safety_settings=self.safety_settings, 48 | ) 49 | time.sleep(self.FIX_INTERVAL_SECOND) 50 | return completion.text 51 | 52 | 53 | def GPT_decode(self, inputs): 54 | 
delay = 1 55 | blocked = 0 56 | for i in range(self.MAX_RETRY_NUM): 57 | try: 58 | response_content = self._GPT_decode(inputs) 59 | return response_content 60 | except Exception as e: 61 | if 'quick accessor' in str(e) or 'block' in str(e): 62 | print("Content blocked, retrying ...") 63 | blocked += 1 64 | if blocked > 10: 65 | print("Blocked for too many times, using 'Response not available " 66 | "due to content restrictions.' as response, exiting...") 67 | return 'Response not available due to content restrictions.' 68 | elif 'quota' in str(e).lower() or 'limit' in str(e).lower(): 69 | exponential_base = 2 70 | delay *= exponential_base * (1 + random.random()) 71 | print(f"Error, retrying after {round(delay, 2)} seconds, {i+1}-th retry...") 72 | print(e) 73 | time.sleep(delay) 74 | continue 75 | else: 76 | print(f"Error in decode, retrying...") 77 | print(e) 78 | time.sleep(10) 79 | continue 80 | print(f"Failed after {self.MAX_RETRY_NUM} retries.") 81 | return 'Error' 82 | 83 | 84 | def annotate_p(self, task): 85 | # the annotation logic here 86 | 87 | pass 88 | 89 | 90 | def annotate_parallel(self, tasks): 91 | print(f"Parsing in parallel, in total {self.args.api_parallel_num} threads.") 92 | results = [] 93 | with ThreadPoolExecutor(self.args.api_parallel_num) as executor: 94 | for entry in tqdm( 95 | executor.map(self.annotate_p, tasks), total=len(tasks) 96 | ): 97 | results.append(entry) 98 | if None in results: 99 | raise ValueError("Some entries are not annotated due to errors in annotate_p, please inspect and retry.") 100 | return results -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | .env 3 | .vscode 4 | *.DS_Store 5 | resampled_output/ 6 | misc/ 7 | plot/ 8 | mixeval_x/data/ 9 | post_process/ 10 | unit_tests/ 11 | eval_scripts/ 12 | /mix_eval/ 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | share/python-wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .nox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | *.py,cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | cover/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | db.sqlite3 75 | db.sqlite3-journal 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | .pybuilder/ 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # IPython 95 | profile_default/ 96 | ipython_config.py 97 | 98 | # pyenv 99 | # For a library or package, you might want to ignore these files since the code is 100 | # intended to run in multiple environments; otherwise, check them in: 101 | # .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # poetry 111 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 112 | # This is especially recommended for binary packages to ensure reproducibility, and is more 113 | # commonly ignored for libraries. 114 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 115 | #poetry.lock 116 | 117 | # pdm 118 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 119 | #pdm.lock 120 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 121 | # in version control. 122 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 123 | .pdm.toml 124 | .pdm-python 125 | .pdm-build/ 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .venv 140 | env/ 141 | venv/ 142 | ENV/ 143 | env.bak/ 144 | venv.bak/ 145 | 146 | # Spyder project settings 147 | .spyderproject 148 | .spyproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | # mkdocs documentation 154 | /site 155 | 156 | # mypy 157 | .mypy_cache/ 158 | .dmypy.json 159 | dmypy.json 160 | 161 | # Pyre type checker 162 | .pyre/ 163 | 164 | # pytype static type analyzer 165 | .pytype/ 166 | 167 | # Cython debug symbols 168 | cython_debug/ 169 | 170 | # PyCharm 171 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 172 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 173 | # and can be added to the global gitignore or merged into this file. For a more nuclear 174 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
175 | #.idea/ 176 | -------------------------------------------------------------------------------- /mixeval_x/utils/common_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | # import torch 4 | import numpy as np 5 | import random 6 | import re 7 | 8 | import tiktoken 9 | 10 | def set_seed(seed: int = 42) -> None: 11 | np.random.seed(seed) 12 | random.seed(seed) 13 | # torch.manual_seed(seed) 14 | # torch.cuda.manual_seed(seed) 15 | # # When running on the CuDNN backend, two further options must be set 16 | # torch.backends.cudnn.deterministic = True 17 | # torch.backends.cudnn.benchmark = False 18 | # Set a fixed value for the hash seed 19 | os.environ["PYTHONHASHSEED"] = str(seed) 20 | # print(f"Random seed set as {seed}") 21 | 22 | def cache_status(args, status): 23 | status_path = os.path.join( 24 | args.output_dir, 25 | args.model_name, 26 | args.benchmark, 27 | args.version, 28 | f'status_{args.split}.json' 29 | ) 30 | with open(status_path, 'w') as f: 31 | arg_dict = args.__dict__ 32 | status_dict = { 33 | 'args': arg_dict, 34 | 'status': status 35 | } 36 | json.dump(status_dict, f, indent=4) 37 | 38 | def read_status(args): 39 | status_path = os.path.join( 40 | args.output_dir, 41 | args.model_name, 42 | args.benchmark, 43 | args.version, 44 | f'status_{args.split}.json' 45 | ) 46 | with open(status_path, 'r') as f: 47 | return json.load(f) 48 | 49 | def dict_equal(dict1, dict2, keys_to_ignore=['resume', 'compute_score_from_judged_file', 'inference_only']): 50 | modified_dict1 = dict1.copy() 51 | modified_dict2 = dict2.copy() 52 | for key in keys_to_ignore: 53 | modified_dict1.pop(key, None) 54 | modified_dict2.pop(key, None) 55 | return modified_dict1 == modified_dict2 56 | 57 | def log_error(message, path): 58 | with open(path, 'a') as f: 59 | f.write(f"{message}\n") 60 | 61 | def get_gpu_memory(max_gpus=None): 62 | """Get available memory for each GPU.""" 63 | import torch 64 | 65 | gpu_memory = [] 66 | num_gpus = ( 67 | torch.cuda.device_count() 68 | if max_gpus is None 69 | else min(max_gpus, torch.cuda.device_count()) 70 | ) 71 | 72 | for gpu_id in range(num_gpus): 73 | with torch.cuda.device(gpu_id): 74 | device = torch.cuda.current_device() 75 | gpu_properties = torch.cuda.get_device_properties(device) 76 | total_memory = gpu_properties.total_memory / (1024**3) 77 | allocated_memory = torch.cuda.memory_allocated() / (1024**3) 78 | available_memory = total_memory - allocated_memory 79 | gpu_memory.append(available_memory) 80 | return gpu_memory 81 | 82 | def is_meaningful(s): 83 | # Check for alphabetic characters (including other languages) and numeric digits 84 | if re.search(r'[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0370-\u03FF\u0400-\u04FF\u0500-\u052F\u2C00-\u2C5F\uA640-\uA69F\uAC00-\uD7AF\u4E00-\u9FFF]', s): 85 | return True 86 | else: 87 | return False 88 | 89 | def extract_basemodel_response_3e(response): 90 | _response = response.split('\n\n\nQuestion')[0] 91 | if is_meaningful(_response): # non-trival response 92 | return _response 93 | else: 94 | return response 95 | 96 | def extract_basemodel_response_2e(response): 97 | _response = response.split('\n\n')[0] 98 | if is_meaningful(_response): # non-trival response 99 | return _response 100 | else: 101 | return response 102 | 103 | def num_tokens_from_message(message, model="gpt-3.5-turbo-0613"): 104 | try: 105 | encoding = tiktoken.encoding_for_model(model) 106 | except KeyError: 107 | encoding = 
tiktoken.get_encoding("cl100k_base") 108 | if model == "gpt-3.5-turbo-0613": # note: future models may deviate from this 109 | num_tokens = len(encoding.encode(message)) 110 | return num_tokens 111 | else: 112 | raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}. 113 | See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""") 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /mixeval_x/utils/judge_freeform_parser.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import time 3 | import random 4 | import os 5 | from dotenv import load_dotenv 6 | 7 | from concurrent.futures import ThreadPoolExecutor 8 | from openai import OpenAI 9 | from openai._exceptions import RateLimitError, BadRequestError 10 | from httpx import Timeout 11 | 12 | from mixeval_x.prompts.judge_prompts import ( 13 | image2text_gpt_judge_for_closeended_freeform, 14 | audio2text_gpt_judge_for_closeended_freeform, 15 | video2text_gpt_judge_for_closeended_freeform, 16 | ) 17 | from mixeval_x.utils.common_utils import extract_basemodel_response_3e 18 | 19 | ########################ChatGPT######################## 20 | class ChatGPTJudgeCloseendFreeform: 21 | def __init__(self, args): 22 | self.args = args 23 | self.JUDGE = args.freeform_judge 24 | self.FIX_INTERVAL_SECOND = 0 25 | self.MAX_RETRY_NUM = 99 26 | self.MAX_NEW_TOKENS = 999 27 | 28 | load_dotenv() 29 | self.client = OpenAI( 30 | api_key=os.getenv('MODEL_PARSER_API'), 31 | timeout=Timeout(timeout=60.0, connect=5.0) 32 | ) 33 | 34 | def format_prompts(self, inputs): 35 | prompt, gold_ans, response = inputs 36 | gold_ans = '; '.join([f" {ans}" for i, ans in enumerate(gold_ans)]) 37 | if self.args.benchmark in ['image2text', 'image2text_hard']: 38 | formated = image2text_gpt_judge_for_closeended_freeform(prompt, gold_ans, response) 39 | elif self.args.benchmark in ['audio2text', 'audio2text_hard']: 40 | formated = audio2text_gpt_judge_for_closeended_freeform(prompt, gold_ans, response) 41 | elif self.args.benchmark in ['video2text', 'video2text_hard']: 42 | formated = video2text_gpt_judge_for_closeended_freeform(prompt, gold_ans, response) 43 | else: 44 | raise ValueError(f"Benchmark {self.args.benchmark} not supported in the closeended parser.") 45 | return formated 46 | 47 | def _GPT_decode(self, inputs): 48 | completion = self.client.chat.completions.create( 49 | model=self.JUDGE, 50 | response_format={ "type": 'text'}, 51 | max_tokens=self.MAX_NEW_TOKENS, 52 | messages=self.format_prompts(inputs), 53 | ) 54 | time.sleep(self.FIX_INTERVAL_SECOND) 55 | return completion 56 | 57 | 58 | def GPT_decode(self, inputs): 59 | delay = 1 60 | blocked = 0 61 | for i in range(self.MAX_RETRY_NUM): 62 | try: 63 | completion = self._GPT_decode(inputs) 64 | return completion 65 | except RateLimitError as e: 66 | exponential_base = 2 67 | delay *= exponential_base * (1 + random.random()) 68 | print(f"RateLimitError, retrying after {round(delay, 2)} seconds, {i+1}-th retry...") 69 | print(e) 70 | time.sleep(delay) 71 | continue 72 | except BadRequestError as e: 73 | blocked += 1 74 | if blocked >= 10: 75 | print("Blocked too many times, skipping...") 76 | return 'Blocked' 77 | print(f"Input is blocked, retrying...") 78 | print(e) 79 | time.sleep(1) 80 | continue 81 | except Exception as e: 82 | print(f"Error in GPT_decode, retrying...") 83 | print(e) 
84 | time.sleep(1) 85 | continue 86 | print(f"Failed after {self.MAX_RETRY_NUM} retries.") 87 | return 'Error' 88 | 89 | 90 | def annotate_p(self, task): 91 | prompt = task['prompt'] 92 | gold_ans = task['target'] 93 | response = task['response'] 94 | 95 | if hasattr(self.args, 'model_type'): 96 | if self.args.model_type == 'BaseModel': 97 | response = extract_basemodel_response_3e(response) 98 | task['response_extracted'] = response 99 | elif self.args.model_type == 'ChatModel': 100 | pass 101 | elif self.args.model_type == 'APIModelBase': 102 | pass 103 | else: 104 | raise ValueError(f"Model type {self.args.model_type} not supported.") 105 | 106 | if not isinstance(gold_ans, list): 107 | print(f"Invalid target: {gold_ans}") 108 | return None 109 | 110 | inputs = (prompt, gold_ans, response) 111 | 112 | completion = self.GPT_decode(inputs) 113 | if completion == 'Error': 114 | print(f"Error in GPT_decode, the entry {task} will be retried later...") 115 | task['judge_response'] = None 116 | return task 117 | elif completion == 'Blocked': 118 | print(f"{task}: \n\nBlocked, the entry treated as bad entry.") 119 | task['judge_response'] = '[[0.0]]' 120 | return task 121 | annotation = completion.choices[0].message.content 122 | task['judge_response'] = annotation 123 | return task 124 | 125 | 126 | def annotate_parallel(self, tasks): 127 | print(f"Parsing in parallel, in total {self.args.api_parallel_num} threads.") 128 | results = [] 129 | with ThreadPoolExecutor(self.args.api_parallel_num) as executor: 130 | for entry in tqdm( 131 | executor.map(self.annotate_p, tasks), total=len(tasks) 132 | ): 133 | results.append(entry) 134 | if None in results: 135 | raise ValueError("Some entries are not annotated due to errors in annotate_p, please inspect and retry.") 136 | return results 137 | 138 | ########################Claude 3######################## 139 | class ClaudeJudgeCloseendFreeform: 140 | def __init__(self): 141 | raise NotImplementedError 142 | 143 | 144 | ########################Gemini######################## 145 | class GeminiJudgeCloseendFreeform: 146 | def __init__(self): 147 | raise NotImplementedError -------------------------------------------------------------------------------- /mixeval_x/utils/judge_multichoice_parser.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import time 3 | import random 4 | import os 5 | from dotenv import load_dotenv 6 | 7 | from concurrent.futures import ThreadPoolExecutor 8 | from openai import OpenAI 9 | from openai._exceptions import RateLimitError, BadRequestError 10 | from httpx import Timeout 11 | 12 | from mixeval_x.prompts.judge_prompts import ( 13 | image2text_gpt_judge_for_closeended_multiplechoice, 14 | video2text_gpt_judge_for_closeended_multiplechoice, 15 | ) 16 | from mixeval_x.utils.common_utils import extract_basemodel_response_2e 17 | 18 | ########################ChatGPT######################## 19 | class ChatGPTJudgeCloseendMultichoice: 20 | def __init__(self, args): 21 | self.args = args 22 | 23 | self.JUDGE = args.multichoice_judge 24 | self.FIX_INTERVAL_SECOND = 0 25 | self.MAX_RETRY_NUM = 99 26 | self.MAX_NEW_TOKENS = 999 27 | 28 | load_dotenv() 29 | self.client = OpenAI( 30 | api_key=os.getenv('MODEL_PARSER_API'), 31 | timeout=Timeout(timeout=20.0, connect=5.0) 32 | ) 33 | 34 | def format_prompts(self, inputs): 35 | prompt, options, response = inputs 36 | option_letters = [chr(ord("A") + i) for i in range(len(options))] 37 | options = 
"\n".join([f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)]) 38 | if self.args.benchmark in ['image2text', 'image2text_hard']: 39 | formated = image2text_gpt_judge_for_closeended_multiplechoice(prompt, options, response) 40 | elif self.args.benchmark in ['video2text', 'video2text_hard']: 41 | formated = video2text_gpt_judge_for_closeended_multiplechoice(prompt, options, response) 42 | else: 43 | raise ValueError(f"Benchmark {self.args.benchmark} not supported in the closeended parser.") 44 | return formated 45 | 46 | def _GPT_decode(self, inputs): 47 | completion = self.client.chat.completions.create( 48 | model=self.JUDGE, 49 | response_format={ "type": 'text'}, 50 | max_tokens=self.MAX_NEW_TOKENS, 51 | messages=self.format_prompts(inputs), 52 | ) 53 | time.sleep(self.FIX_INTERVAL_SECOND) 54 | return completion 55 | 56 | 57 | def GPT_decode(self, inputs): 58 | delay = 1 59 | blocked = 0 60 | for i in range(self.MAX_RETRY_NUM): 61 | try: 62 | completion = self._GPT_decode(inputs) 63 | return completion 64 | except RateLimitError as e: 65 | exponential_base = 2 66 | delay *= exponential_base * (1 + random.random()) 67 | print(f"RateLimitError, retrying after {round(delay, 2)} seconds, {i+1}-th retry...") 68 | print(e) 69 | time.sleep(delay) 70 | continue 71 | except BadRequestError as e: 72 | blocked += 1 73 | if blocked >= 10: 74 | print("Blocked too many times, skipping...") 75 | return 'Blocked' 76 | print(f"Input is blocked, retrying...") 77 | print(e) 78 | time.sleep(1) 79 | continue 80 | except Exception as e: 81 | print(f"Error in GPT_decode, retrying...") 82 | print(e) 83 | time.sleep(1) 84 | continue 85 | print(f"Failed after {self.MAX_RETRY_NUM} retries.") 86 | return 'Error' 87 | 88 | 89 | def annotate_p(self, task): 90 | prompt = task['prompt'] 91 | options = task['options'] 92 | response = task['response'] 93 | 94 | if hasattr(self.args, 'model_type'): 95 | if self.args.model_type == 'BaseModel': 96 | response = extract_basemodel_response_2e(response) 97 | task['response_extracted'] = response 98 | elif self.args.model_type == 'ChatModel': 99 | pass 100 | elif self.args.model_type == 'APIModelBase': 101 | pass 102 | else: 103 | raise ValueError(f"Model type {self.args.model_type} not supported.") 104 | 105 | if not isinstance(options, list): 106 | print(f"Invalid target: {options}") 107 | return None 108 | 109 | inputs = (prompt, options, response) 110 | 111 | completion = self.GPT_decode(inputs) 112 | if completion == 'Error': 113 | print(f"Error in GPT_decode, the entry {task} will be retried later...") 114 | task['judge_response'] = None 115 | return task 116 | elif completion == 'Blocked': 117 | print(f"{task}: \n\nBlocked, the entry treated as bad entry. 
Randomly assigning a choice.") 118 | options = task['options'] 119 | option_letters = [chr(ord("A") + i) for i in range(len(options))] 120 | task['judge_response'] = f"[[{random.choice(option_letters)}]]" 121 | return task 122 | annotation = completion.choices[0].message.content 123 | task['judge_response'] = annotation 124 | return task 125 | 126 | 127 | def annotate_parallel(self, tasks): 128 | print(f"Parsing in parallel, in total {self.args.api_parallel_num} threads.") 129 | results = [] 130 | with ThreadPoolExecutor(self.args.api_parallel_num) as executor: 131 | for entry in tqdm( 132 | executor.map(self.annotate_p, tasks), total=len(tasks) 133 | ): 134 | results.append(entry) 135 | if None in results: 136 | raise ValueError("Some entries are not annotated due to errors in annotate_p, please inspect and retry.") 137 | return results 138 | 139 | ########################Claude 3######################## 140 | class ClaudeJudgeCloseendMultichoice: 141 | def __init__(self): 142 | raise NotImplementedError 143 | 144 | 145 | ########################Gemini######################## 146 | class GeminiJudgeCloseendMultichoice: 147 | def __init__(self): 148 | raise NotImplementedError -------------------------------------------------------------------------------- /mixeval_x/utils/judge_text2action.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import time 3 | import random 4 | import os 5 | from dotenv import load_dotenv 6 | import base64 7 | import re 8 | import ast 9 | 10 | from concurrent.futures import ThreadPoolExecutor 11 | from openai import OpenAI 12 | from openai._exceptions import RateLimitError, BadRequestError 13 | from httpx import Timeout 14 | 15 | from mixeval_x.prompts.judge_prompts import ( 16 | text2action_gpt_judge, 17 | ) 18 | 19 | ########################ChatGPT######################## 20 | class ChatGPTJudgeText2Action: 21 | def __init__(self, args): 22 | self.args = args 23 | self.JUDGE = args.judge_model 24 | self.FIX_INTERVAL_SECOND = 0 25 | self.MAX_RETRY_NUM = 999999 26 | self.MAX_NEW_TOKENS = 999 27 | 28 | self.FORMAT_MAXRETRY = 10 29 | 30 | load_dotenv() 31 | self.client = OpenAI( 32 | api_key=os.getenv('MODEL_PARSER_API'), 33 | timeout=Timeout(timeout=60.0, connect=5.0) 34 | ) 35 | 36 | @staticmethod 37 | def get_score_from_judge(judge_response): 38 | """ 39 | Get the score from the judge response. 
40 | """ 41 | one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]") 42 | one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]") 43 | 44 | match = re.search(one_score_pattern, judge_response) 45 | if not match: 46 | match = re.search(one_score_pattern_backup, judge_response) 47 | 48 | if match: 49 | rating = ast.literal_eval(match.groups()[0]) 50 | else: 51 | rating = -1 52 | 53 | return float(rating) 54 | 55 | def format_prompts(self, inputs, mode, history=[]): 56 | 57 | if mode == 'turn_1': 58 | formated = text2action_gpt_judge(*inputs) 59 | elif mode == 'append_message' and history: 60 | history.append(inputs) 61 | formated = history 62 | else: 63 | raise ValueError(f"Invalid mode: {mode}.") 64 | 65 | return formated 66 | 67 | def _GPT_decode(self, inputs, mode, history=[]): 68 | completion = self.client.chat.completions.create( 69 | model=self.JUDGE, 70 | response_format={ "type": 'text'}, 71 | max_tokens=self.MAX_NEW_TOKENS, 72 | messages=self.format_prompts(inputs, mode, history), 73 | ) 74 | time.sleep(self.FIX_INTERVAL_SECOND) 75 | return completion 76 | 77 | def GPT_decode(self, inputs, mode, history=[]): 78 | delay = 1 79 | blocked = 0 80 | for i in range(self.MAX_RETRY_NUM): 81 | try: 82 | completion = self._GPT_decode(inputs, mode, history) 83 | return completion 84 | except RateLimitError as e: 85 | exponential_base = 2 86 | delay *= exponential_base * (1 + random.random()) 87 | print(f"RateLimitError, retrying after {round(delay, 2)} seconds, {i+1}-th retry...") 88 | print(e) 89 | time.sleep(delay) 90 | continue 91 | except BadRequestError as e: 92 | blocked += 1 93 | if blocked >= 10: 94 | print("Blocked too many times, skipping...") 95 | return 'Blocked' 96 | print(f"Input is blocked, retrying...") 97 | print(e) 98 | time.sleep(1) 99 | continue 100 | except Exception as e: 101 | print(f"Error in GPT_decode, retrying...") 102 | print(e) 103 | time.sleep(1) 104 | continue 105 | print(f"Failed after {self.MAX_RETRY_NUM} retries.") 106 | return 'Error' 107 | 108 | def annotate_p(self, task): 109 | task_description = task['task description'] 110 | allowed_actions = task['allowed actions'] 111 | visible_objects = task['visible objects'] 112 | already_executed_steps = task['already executed steps'] 113 | target = task['target'] 114 | model_response = task['response'] 115 | 116 | # first turn 117 | inputs = (task_description, allowed_actions, visible_objects, already_executed_steps, target, model_response) 118 | 119 | completion = self.GPT_decode(inputs, 'turn_1') 120 | if completion == 'Error': 121 | print(f"Error in GPT_decode, the entry treated as bad entry.") 122 | task['judge_response'] = '[[0.0]]' 123 | elif completion == 'Blocked': 124 | print(f"{task}: \n\nBlocked, the entry treated as bad entry.") 125 | task['judge_response'] = '[[0.0]]' 126 | else: 127 | annotation = completion.choices[0].message.content 128 | task['judge_response'] = annotation 129 | for i in range(self.FORMAT_MAXRETRY): 130 | if self.get_score_from_judge(annotation) != -1: 131 | task['judge_score'] = self.get_score_from_judge(annotation) 132 | break 133 | else: 134 | print(f"No score found in the response, retrying...") 135 | history = text2action_gpt_judge(*inputs) 136 | completion = self.GPT_decode({"role": "user", "content": "Continue your judgment and finish by outputting a final score with the above-mentioned format."}, 'append_message', history) 137 | annotation = completion.choices[0].message.content 138 | 139 | if self.get_score_from_judge(annotation) != -1: 140 | task['judge_score'] = 
self.get_score_from_judge(annotation) 141 | else: 142 | task['judge_score'] = None 143 | print(f"No score found in the response, please inspect and retry.") 144 | 145 | 146 | return task 147 | 148 | def annotate_parallel(self, tasks): 149 | print(f"Parsing in parallel, in total {self.args.api_parallel_num} threads.") 150 | results = [] 151 | with ThreadPoolExecutor(self.args.api_parallel_num) as executor: 152 | for entry in tqdm( 153 | executor.map(self.annotate_p, tasks), total=len(tasks) 154 | ): 155 | results.append(entry) 156 | if None in results: 157 | raise ValueError("Some entries are not annotated due to errors in annotate_p, please inspect and retry.") 158 | return results 159 | 160 | ########################Claude 3######################## 161 | class ClaudeJudgeText2Action: 162 | def __init__(self): 163 | raise NotImplementedError 164 | 165 | 166 | ########################Gemini######################## 167 | class GeminiJudgeText2Action: 168 | def __init__(self): 169 | raise NotImplementedError -------------------------------------------------------------------------------- /docs/model_output_formats.md: -------------------------------------------------------------------------------- 1 | # Model Output Formats 2 | 3 | As illustrated in the README.md, you can run our provided command after preparing your model output files as specified in this document. For example, below is our provided grading command for the Image2Text benchmark: 4 | 5 | ``` 6 | python -m mixeval_x.compute_metrics_mmu \ 7 | --benchmark image2text \ 8 | --model_response_dir THE_PATH_TO_MODEL_OUTPUT_FOLDER \ 9 | --models_to_eval \ 10 | gemini_1_5_pro \ 11 | gemini_1_5_flash 12 | ``` 13 | 14 | Here, the `--model_response_dir` specifies the grandparent directory of your model directory. `--benchmark` specifies the parent directory name of your model directory. Your model directory contains the model output files. The model directory name is your model name that will be specified by `--models_to_eval`. Generally, your prepared model output files should have a structure like this: 15 | 16 | ``` 17 | └── model_response_dir 18 | │ 19 | └── benchmark 20 | │ 21 | └── your_model_name 22 | │ 23 | ├── file1.jsonl 24 | │ 25 | └── file2.jsonl 26 | │ 27 | └── ... 28 | ``` 29 | 30 | > The difference between model input (the benchmark data) and output (the model response file) is just the `"response"` field, i.e., each entry in your output file should keep all key-value pairs (including the 'id') of the input entry, with an additional `"response"` field representing the model's output. 31 | 32 | > The model scores for each model will be saved in the model directory; scores for all graded models will be saved in the benchmark directory. 33 | 34 | 🚨 **We show example model output structures and files in `mixeval_x/examples/`**. 35 | 36 | The exact output structures and data formats are detailed below. 37 | 38 |
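Before the per-task formats below, here is a minimal sketch of how such a response file can be produced for an MMU free-form split. It assumes the benchmark file is a JSON dict keyed by entry id (the layout read by `mixeval_x/prompts/evaluation_prompts.py`), and `run_model` is a hypothetical wrapper around your own inference call; the paths and the model name are placeholders, not part of this repo.

```
import json
import os

def write_freeform_responses(task_file, model_response_dir, benchmark, model_name, run_model):
    """Write <model_name>_ff.jsonl under model_response_dir/<benchmark>/<model_name>/."""
    out_dir = os.path.join(model_response_dir, benchmark, model_name)
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{model_name}_ff.jsonl")

    with open(task_file, "r") as f:
        tasks = json.load(f)  # benchmark data: a dict keyed by entry id

    with open(out_path, "w") as f_out:
        for entry_id, entry in tasks.items():
            record = dict(entry)                    # keep every input key-value pair
            record["id"] = entry_id                 # including the id
            record["response"] = run_model(entry)   # add the model's output
            f_out.write(json.dumps(record) + "\n")
```

The multiple-choice file (`your_model_name_mp.jsonl`) can be produced the same way from the multiple-choice split.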


39 | 40 | ## MMU Tasks (Image2Text, Video2Text, and Audio2Text) 41 | 42 | ### Structure 43 | ``` 44 | └── model_response_dir 45 | │ 46 | └── benchmark 47 | │ 48 | └── your_model_name 49 | │ 50 | ├── your_model_name_ff.jsonl 51 | │ 52 | └── your_model_name_mp.jsonl 53 | ``` 54 | > Note that the Audio2Text benchmark doesn't have the multiple-choice subset, therefore the "`_mp`" file is not required. 55 | 56 | ### Output Data Format 57 | 58 | Free-form (`your_model_name_ff.jsonl`): 59 | ``` 60 | [ 61 | { 62 | "id": "1", 63 | "problem_type": "free-form", 64 | "image_id": "1.jpg", 65 | "prompt": "Where are the people that are standing looking at?", 66 | "target": ["away"], 67 | "benchmark_name": "GQA", 68 | "response": "The kite" 69 | }, 70 | ... 71 | ] 72 | ``` 73 | 74 | Multiple-choice (`your_model_name_mp.jsonl`): 75 | ``` 76 | [ 77 | { 78 | "problem_type": "single-choice", 79 | "image_id": "1.jpg", 80 | "prompt": "Is the man's face clearly visible in the image?", 81 | "options": ["Yes", "No"], 82 | "target": [1], 83 | "benchmark_name": "Q-Bench", 84 | "response": "B." 85 | }, 86 | ... 87 | ] 88 | ``` 89 | 90 | ## Agent Tasks 91 | 92 | ### Text2Action 93 | 94 | Structure: 95 | ``` 96 | └── model_response_dir 97 | │ 98 | └── benchmark 99 | | 100 | └── your_model_name 101 | │ 102 | └── your_model_name_t2a.jsonl 103 | ``` 104 | 105 | Output Data Format (`your_model_name_t2a.jsonl`): 106 | ``` 107 | [ 108 | { 109 | "id": "1", 110 | "task description": "Take a photo of the Oso Nabukete cave entrance.", 111 | "allowed actions": "[Navigation], [Photograph], [InteractWithObject], [PickupObject], [PutObject], [UseObject], [ChargeDevice], [CheckBattery], [AdjustSettings], [Wait], [Speak], [Listen]", 112 | "visible objects": ", , , , , , , , , , , ", "already executed steps": "[Navigation] , [PickupObject] , [CheckBattery] ", 113 | "target": "[AdjustSettings] , [Photograph] ", 114 | "response": "[AdjustSettings] , [ChargeDevice] , [PickupObject] , [UseObject] , [Photograph] " 115 | }, 116 | ... 117 | ] 118 | ``` 119 | 120 | ### Image2Action 121 | Structure: 122 | ``` 123 | └── model_response_dir 124 | │ 125 | └── benchmark 126 | | 127 | └── your_model_name 128 | │ 129 | └── your_model_name_i2a.jsonl 130 | ``` 131 | 132 | Output Data Format (`your_model_name_i2a.jsonl`): 133 | ``` 134 | [ 135 | { 136 | "id": "1", 137 | "image_id": "7.jpg", 138 | "task description": "Paint an object with blue color to resemble a blueberry.", 139 | "allowed actions": "[PickupObject], [PutObject], [Navigation], [OpenContainer], [CloseContainer], [SelectPaintTool], [MixColor], [ApplyPaint], [CleanBrush], [InspectObject], [DryObject]", 140 | "already executed steps": "[Navigation] , [SelectPaintTool] , [MixColor] , [ApplyPaint] ", 141 | "target": "[InspectObject] , [DryObject] ", 142 | "response": "[InspectObject] , [DryObject] , [CleanBrush] " 143 | }, 144 | ... 
145 | ] 146 | ``` 147 | 148 | ## MMG Tasks (only supports Text2Image) 149 | 150 | ### Text2Image 151 | Structure: 152 | ``` 153 | └── model_response_dir 154 | │ 155 | └── benchmark 156 | | 157 | └── your_model_name 158 | │ 159 | └── your_model_name_t2i.jsonl 160 | ``` 161 | 162 | Output Data Format (`your_model_name_t2i.jsonl`): 163 | ``` 164 | [ 165 | { 166 | "id": "1", 167 | "first_turn_user_prompt": "Design a vibrant cityscape banner featuring the iconic Sydney Opera House and Harbour Bridge under a dazzling sunset, with the City of Sydney\u2019s logo prominently displayed in the foreground, ensuring it catches the eye against the vivid backdrop.", 168 | "first_turn_caption": "A vibrant cityscape banner featuring the iconic Sydney Opera House and Harbour Bridge under a dazzling sunset, with the City of Sydney\u2019s logo prominently displayed in the foreground, ensuring it catches the eye against the vivid backdrop.", 169 | "second_turn_user_prompt": " Remove the City of Sydney\u2019s logo.", 170 | "second_turn_caption": "A vibrant cityscape banner featuring the iconic Sydney Opera House and Harbour Bridge under a dazzling sunset, ensuring it catches the eye against the vivid backdrop.", 171 | "gen_1st_turn": "THE DIR PATH/1.jpg", 172 | "gen_2nd_turn": "THE DIR PATH/2.jpg", 173 | }, 174 | ... 175 | ] 176 | ``` 177 | 178 | 179 | -------------------------------------------------------------------------------- /mixeval_x/utils/judge_image2action.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import time 3 | import random 4 | import os 5 | from dotenv import load_dotenv 6 | import base64 7 | import re 8 | import ast 9 | 10 | from concurrent.futures import ThreadPoolExecutor 11 | from openai import OpenAI 12 | from openai._exceptions import RateLimitError, BadRequestError 13 | from httpx import Timeout 14 | 15 | from mixeval_x.prompts.judge_prompts import ( 16 | image2action_gpt_judge, 17 | ) 18 | 19 | ########################ChatGPT######################## 20 | class ChatGPTJudgeImage2Action: 21 | def __init__(self, args): 22 | self.args = args 23 | self.JUDGE = args.judge_model 24 | self.FIX_INTERVAL_SECOND = 0 25 | self.MAX_RETRY_NUM = 999999 26 | self.MAX_NEW_TOKENS = 999 27 | 28 | self.FORMAT_MAXRETRY = 10 29 | 30 | image2action_example_image_path = os.path.join(args.image2action_image_dir, 'example_1.jpg') 31 | if not os.path.exists(image2action_example_image_path): 32 | raise ValueError(f"Example image: 'example_1.jpg' for image2action grading not found in the image directory: {args.image2action_image_dir}. Please use the official image directory for Image2Action.") 33 | self.example_image = self.encode_image(image2action_example_image_path) 34 | 35 | load_dotenv() 36 | self.client = OpenAI( 37 | api_key=os.getenv('MODEL_PARSER_API'), 38 | timeout=Timeout(timeout=60.0, connect=5.0) 39 | ) 40 | 41 | @staticmethod 42 | def encode_image(image_path): 43 | if not os.path.exists(image_path): 44 | image_path = image_path.replace('.jpg', '.JPG') 45 | with open(image_path, "rb") as image_file: 46 | return base64.b64encode(image_file.read()).decode('utf-8') 47 | 48 | @staticmethod 49 | def get_score_from_judge(judge_response): 50 | """ 51 | Get the score from the judge response. 
52 | """ 53 | one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]") 54 | one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]") 55 | 56 | match = re.search(one_score_pattern, judge_response) 57 | if not match: 58 | match = re.search(one_score_pattern_backup, judge_response) 59 | 60 | if match: 61 | rating = ast.literal_eval(match.groups()[0]) 62 | else: 63 | rating = -1 64 | 65 | return float(rating) 66 | 67 | def format_prompts(self, inputs, mode, history=[]): 68 | 69 | if mode == 'turn_1': 70 | formated = image2action_gpt_judge(*inputs) 71 | elif mode == 'append_message' and history: 72 | history.append(inputs) 73 | formated = history 74 | else: 75 | raise ValueError(f"Invalid mode: {mode}.") 76 | 77 | return formated 78 | 79 | def _GPT_decode(self, inputs, mode, history=[]): 80 | completion = self.client.chat.completions.create( 81 | model=self.JUDGE, 82 | response_format={ "type": 'text'}, 83 | max_tokens=self.MAX_NEW_TOKENS, 84 | messages=self.format_prompts(inputs, mode, history), 85 | ) 86 | time.sleep(self.FIX_INTERVAL_SECOND) 87 | return completion 88 | 89 | def GPT_decode(self, inputs, mode, history=[]): 90 | delay = 1 91 | blocked = 0 92 | for i in range(self.MAX_RETRY_NUM): 93 | try: 94 | completion = self._GPT_decode(inputs, mode, history) 95 | return completion 96 | except RateLimitError as e: 97 | exponential_base = 2 98 | delay *= exponential_base * (1 + random.random()) 99 | print(f"RateLimitError, retrying after {round(delay, 2)} seconds, {i+1}-th retry...") 100 | print(e) 101 | time.sleep(delay) 102 | continue 103 | except BadRequestError as e: 104 | blocked += 1 105 | if blocked >= 10: 106 | print("Blocked too many times, skipping...") 107 | return 'Blocked' 108 | print(f"Input is blocked, retrying...") 109 | print(e) 110 | time.sleep(1) 111 | continue 112 | except Exception as e: 113 | print(f"Error in GPT_decode, retrying...") 114 | print(e) 115 | time.sleep(1) 116 | continue 117 | print(f"Failed after {self.MAX_RETRY_NUM} retries.") 118 | return 'Error' 119 | 120 | def annotate_p(self, task): 121 | task_description = task['task description'] 122 | allowed_actions = task['allowed actions'] 123 | image_id = task['image_id'] 124 | already_executed_steps = task['already executed steps'] 125 | target = task['target'] 126 | model_response = task['response'] 127 | image = self.encode_image(os.path.join(self.args.image2action_image_dir, image_id)) 128 | 129 | # first turn 130 | inputs = (self.example_image, task_description, allowed_actions, image, already_executed_steps, target, model_response) 131 | 132 | completion = self.GPT_decode(inputs, 'turn_1') 133 | if completion == 'Error': 134 | print(f"Error in GPT_decode, the entry treated as bad entry.") 135 | task['judge_response'] = '[[0.0]]' 136 | elif completion == 'Blocked': 137 | print(f"{task}: \n\nBlocked, the entry treated as bad entry.") 138 | task['judge_response'] = '[[0.0]]' 139 | else: 140 | annotation = completion.choices[0].message.content 141 | task['judge_response'] = annotation 142 | for i in range(self.FORMAT_MAXRETRY): 143 | if self.get_score_from_judge(annotation) != -1: 144 | task['judge_score'] = self.get_score_from_judge(annotation) 145 | break 146 | else: 147 | print(f"No score found in the response, retrying...") 148 | history = image2action_gpt_judge(*inputs) 149 | completion = self.GPT_decode({"role": "user", "content": "Continue your judgment and finish by outputting a final score with the above-mentioned format."}, 'append_message', history) 150 | annotation = 
completion.choices[0].message.content 151 | 152 | if self.get_score_from_judge(annotation) != -1: 153 | task['judge_score'] = self.get_score_from_judge(annotation) 154 | else: 155 | task['judge_score'] = None 156 | print(f"No score found in the response, please inspect and retry.") 157 | 158 | 159 | return task 160 | 161 | def annotate_parallel(self, tasks): 162 | print(f"Parsing in parallel, in total {self.args.api_parallel_num} threads.") 163 | results = [] 164 | with ThreadPoolExecutor(self.args.api_parallel_num) as executor: 165 | for entry in tqdm( 166 | executor.map(self.annotate_p, tasks), total=len(tasks) 167 | ): 168 | results.append(entry) 169 | if None in results: 170 | raise ValueError("Some entries are not annotated due to errors in annotate_p, please inspect and retry.") 171 | return results 172 | 173 | ########################Claude 3######################## 174 | class ClaudeJudgeImage2Action: 175 | def __init__(self): 176 | raise NotImplementedError 177 | 178 | 179 | ########################Gemini######################## 180 | class GeminiJudgeImage2Action: 181 | def __init__(self): 182 | raise NotImplementedError -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 |

🚀 Project Page | 🤗 HF Dataset | 📜 arXiv | 🏆 Leaderboard | 📝 blog | 🤗 HF Paper | 𝕏 Twitter

5 |

6 | 7 | --- 8 | [![Twitter](https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow%20%40Us)](https://x.com/NiJinjie) 9 | ![Static Badge](https://img.shields.io/badge/AGI_Evaluation-dodgerblue) 10 | ![Static Badge](https://img.shields.io/badge/ICLR_2025_Spotlight-cyan) 11 | 12 |
13 |

14 | 15 |

16 | 17 | *MixEval-X encompasses eight input-output modality combinations and can be further extended. Its data points reflect real-world task distributions. The last grid presents the scores of frontier organizations’ flagship models on MixEval-X, normalized to a 0-100 scale, with MMG tasks using win rates instead of Elo. Section C of the paper presents example data samples and model responses.* 18 | 19 |
20 | 21 | # ⚡ News 22 | 23 | [2025-02-11] MixEval-X is accepted to ICLR 2025 as a Spotlight! 24 | 25 | [2024-12-10] [LMMs-Eval](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main) now supports running model evaluations on MixEval-X. You can run MixEval-X MMU tasks for your own models with one command [here](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks/mix_evals)! (feel free to use your own code if you think it's easier) 26 | 27 | [2024-10-20] MixEval-X is released! Checkout the Paper and Leaderboard to learn more about this real-world any-to-any benchmark!🌟 28 | 29 |
30 | 31 | # MixEval-X 32 | 33 | **MixEval-X** is the **first** any-to-any, real-world benchmark featuring **diverse input-output modalities**, **real-world task distributions**, **consistent high standards across modalities**, and **dynamism**. It achieves up to **0.98** correlation with arena-like multi-modal evaluations while being way more efficient. 34 | 35 | > See the [project page](https://mixeval-x.github.io/) and [paper](https://arxiv.org/abs/2410.13754) for more details. 36 | 37 |
38 | 39 | # This Repo 40 | This repo contains the grading code for MixEval-X. Once you have prepared your model outputs according to the required format, you will be able to get the final scores in just a few steps. 41 | 42 | The MixEval-X data can be downloaded from the [huggingface](https://huggingface.co/datasets/MixEval/MixEval-X). 43 | 44 | **Feel free to use your own grading code, as long as it's fair.** 45 | 46 |
47 | 48 | # Quick Start 49 | **(Step 1)** Clone repo and setup the environment: 50 | ``` 51 | git clone https://github.com/Psycoy/MixEval-X.git 52 | cd MixEval-X 53 | conda create -n MixEval-X python=3.11 --yes 54 | conda activate MixEval-X 55 | bash setup.sh 56 | 57 | # setup done 58 | ``` 59 | 60 | **(Step 2)** Setup the OpenAI API key for model parser. Create `.env` file under root dir (`MixEval-X/`) and add the below line to it: 61 | ``` 62 | MODEL_PARSER_API= 63 | ``` 64 | > The values in [Leaderboard](https://mixeval-x.github.io/#leaderboard) use `GPT-3.5-Turbo-0125` as the default model parser for MMU tasks, `gpt-4o-2024-08-06` as the default model parser for agent tasks. 65 | 66 | 67 | **(Step 3)** Prepare the model outputs as specified [here](docs/model_output_formats.md) on your own side, and use the below command to compute the results. That's all! 68 | 69 | Image2Text 70 | ``` 71 | # Normal Version 72 | python -m mixeval_x.compute_metrics_mmu \ 73 | --benchmark image2text \ 74 | --model_response_dir THE_PATH_TO_MODEL_OUTPUT_FOLDER \ 75 | --models_to_eval \ 76 | gemini_1_5_pro \ 77 | gemini_1_5_flash 78 | 79 | # Hard Version 80 | python -m mixeval_x.compute_metrics_mmu \ 81 | --benchmark image2text_hard \ 82 | --model_response_dir THE_PATH_TO_MODEL_OUTPUT_FOLDER \ 83 | --models_to_eval \ 84 | gemini_1_5_pro \ 85 | gemini_1_5_flash 86 | ``` 87 | 88 | Video2Text 89 | ``` 90 | # Normal Version 91 | python -m mixeval_x.compute_metrics_mmu \ 92 | --benchmark video2text \ 93 | --model_response_dir THE_PATH_TO_MODEL_OUTPUT_FOLDER \ 94 | --models_to_eval \ 95 | gemini_1_5_pro \ 96 | gemini_1_5_flash 97 | 98 | # Hard Version 99 | python -m mixeval_x.compute_metrics_mmu \ 100 | --benchmark video2text_hard \ 101 | --model_response_dir THE_PATH_TO_MODEL_OUTPUT_FOLDER \ 102 | --models_to_eval \ 103 | gemini_1_5_pro \ 104 | gemini_1_5_flash 105 | ``` 106 | 107 | Audio2Text 108 | ``` 109 | # Normal Version 110 | python -m mixeval_x.compute_metrics_mmu \ 111 | --benchmark audio2text \ 112 | --model_response_dir THE_PATH_TO_MODEL_OUTPUT_FOLDER \ 113 | --models_to_eval \ 114 | gemini_1_5_pro \ 115 | gemini_1_5_flash 116 | 117 | # Hard Version 118 | python -m mixeval_x.compute_metrics_mmu \ 119 | --benchmark audio2text_hard \ 120 | --model_response_dir THE_PATH_TO_MODEL_OUTPUT_FOLDER \ 121 | --models_to_eval \ 122 | gemini_1_5_pro \ 123 | gemini_1_5_flash 124 | ``` 125 | 126 | Text2Action 127 | ``` 128 | python -m mixeval_x.compute_metrics_mmg_agent \ 129 | --benchmark text2action \ 130 | --model_response_dir THE_PATH_TO_MODEL_OUTPUT_FOLDER \ 131 | --judge_model "gpt-4o-2024-08-06" \ 132 | --models_to_eval \ 133 | gemini_1_5_pro \ 134 | gemini_1_5_flash 135 | ``` 136 | 137 | Image2Action 138 | ``` 139 | python -m mixeval_x.compute_metrics_mmg_agent \ 140 | --benchmark image2action \ 141 | --judge_model "gpt-4o-2024-08-06" \ 142 | --model_response_dir THE_PATH_TO_MODEL_OUTPUT_FOLDER \ 143 | --image2action_image_dir THE_PATH_TO_IMAGE2ACTION_INPUT_IMAGES \ 144 | --models_to_eval \ 145 | gemini_1_5_pro \ 146 | gemini_1_5_flash 147 | ``` 148 | 149 | Text2Image 150 | ``` 151 | python -m mixeval_x.compute_metrics_mmg_agent \ 152 | --benchmark text2image \ 153 | --judge_model "gpt-4o-2024-08-06" \ 154 | --model_response_dir THE_PATH_TO_MODEL_OUTPUT_FOLDER \ 155 | --models_to_eval \ 156 | gemini_1_5_pro \ 157 | gemini_1_5_flash 158 | ``` 159 | 160 | > The MMG results (Text2Image, Text2Video, and Text2Audio) in [Leaderboard](https://mixeval-x.github.io/#leaderboard) were graded with Amazon Mechanical Turk 
workers. Text2Image also supports model parsing (see Section 4.2 of the paper). However, Text2Video and Text2Audio lack capable model judges, so their grading is not implemented. **You should hire human evaluators to grade these two subsets.** 161 | 162 | 
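Before grading your own models, you can sanity-check your setup against the example outputs shipped in `mixeval_x/examples/model_response_dir/`. For instance, assuming you run from the repo root, the bundled Text2Action responses can be graded with the same command pattern as above (this calls the OpenAI judge, so it consumes API credits):

```
python -m mixeval_x.compute_metrics_mmg_agent \
    --benchmark text2action \
    --judge_model "gpt-4o-2024-08-06" \
    --model_response_dir mixeval_x/examples/model_response_dir \
    --models_to_eval \
    gemini_15_pro
```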
163 | 164 | # Why Use MixEval-X Benchmarks? 165 | 166 | 🥇 It extends all the benefits of MixEval to multi-modal evaluations, including a comprehensive and less biased query distribution; fair grading (except for open-ended tasks); dynamism; accurate model ranking; fast, cost-effective, reproducible execution; and a challenging nature. 167 | 168 | 🥇 It establishes unified, high standards across modalities and communities. For single-modality models, it ensures their evaluation keeps up with state-of-the-art standards; for multi-modality models, it ensures consistent, high-standard evaluations across modalities, preventing any single modality from becoming a bottleneck. 169 | 170 | 🥇 Beyond model evaluation, MixEval-X benchmarks different organizations (as shown in the first figure) with balanced dimensions (modalities), unlocking a new level of evaluation. 171 | 172 | 
173 | 174 | # Acknowledgement 175 | 176 | We thank Graham Neubig, Yuntian Deng, Jason Phang, Piotr Nawrot, Luca Soldaini, Guanzhi Wang, and Qinghong Lin for insightful discussions and pointers; we thank Fanyi Pu for his great effort in integrating the evaluation protocols. 177 | 178 | 
179 | 180 | # 🦾 Contribute 181 | 182 | Feel free to hit the ⭐**star** button or 🦾**contribute**! We review new issues and PRs regularly and will acknowledge your contributions! 183 | 184 |
185 | 186 | # 📑 Citation 187 | 188 | If you found this repository useful, please consider 📑**citing**: 189 | 190 | ``` 191 | @article{ni2024mixevalx, 192 | title={MixEval-X: Any-to-Any Evaluations from Real-World Data Mixtures}, 193 | author={Ni, Jinjie and Song, Yifan and Ghosal, Deepanway and Li, Bo and Zhang, David Junhao and Yue, Xiang and Xue, Fuzhao and Zheng, Zian and Zhang, Kaichen and Shah, Mahir and Jain, Kabir and You, Yang and Shieh, Michael}, 194 | journal={arXiv preprint arXiv:2410.13754}, 195 | year={2024} 196 | } 197 | 198 | @article{ni2024mixeval, 199 | title={MixEval: Deriving Wisdom of the Crowd from LLM Benchmark Mixtures}, 200 | author={Ni, Jinjie and Xue, Fuzhao and Yue, Xiang and Deng, Yuntian and Shah, Mahir and Jain, Kabir and Neubig, Graham and You, Yang}, 201 | journal={arXiv preprint arXiv:2406.06565}, 202 | year={2024} 203 | } 204 | ``` -------------------------------------------------------------------------------- /mixeval_x/examples/model_response_dir/text2action/gemini_15_pro/gemini_15_pro_t2a.jsonl: -------------------------------------------------------------------------------- 1 | {"task description": "Take a photo of the Oso Nabukete cave entrance.", "allowed actions": "[Navigation], [Photograph], [InteractWithObject], [PickupObject], [PutObject], [UseObject], [ChargeDevice], [CheckBattery], [AdjustSettings], [Wait], [Speak], [Listen]", "visible objects": ", , , , , , , , , , , ", "already executed steps": "[Navigation] , [PickupObject] , [CheckBattery] ", "target": "[AdjustSettings] , [Photograph] ", "formated_input": "You are a real-world agent, and you will plan action-object sequences for the real-world tasks. You will be provided with 'Task Description', 'Allowed Actions', 'Visible Objects', and 'Already Executed Action-Object Sequences'. The 'Task Description' is a user instruction that instructs you to complete the task. The 'Allowed Actions' is a list of actions that are allowed to be used by you to complete the task. The 'visible objects' is a list of objects that are assumed to be visible to you when you are completing the task. The 'Already Executed Action-Object Sequences' is a list of action-object sequences that are assumed to have been completed by you at the moment. 
You need to plan the remaining action-object sequences to complete the task.\n\nBelow is a simplified example:\n\n**Start of Example**\nTask Description: Put a heated egg in the sink.\nAllowed Actions: [OpenObject], [CloseObject], [PickupObject], [PutObject], [ToggleObjectOn], [ToggleObjectOff], [SliceObject], [Navigation]\nVisible Objects: , , , , , , , , , \nAlready Executed Action-Object Sequences: [Navigation] , [OpenObject] , [PickupObject] , [CloseObject] , [Navigation] , [PutObject] \nYour Planning: [ToggleObjectOn] , [ToggleObjectOff] , [PickupObject] , [Navigation] , [PutObject] \n**End of Example**\n\nWith the above description and example, plan the remaining action-object sequences for the below task:\n\nTask Description: Take a photo of the Oso Nabukete cave entrance.\nAllowed Actions: [Navigation], [Photograph], [InteractWithObject], [PickupObject], [PutObject], [UseObject], [ChargeDevice], [CheckBattery], [AdjustSettings], [Wait], [Speak], [Listen]\nVisible Objects: , , , , , , , , , , , \nAlready Executed Action-Object Sequences: [Navigation] , [PickupObject] , [CheckBattery] \nYour Planning: \n", "id": "1", "response": "Your Planning: \n[AdjustSettings] , [UseObject] \n"} 2 | {"task description": "Disable Google Analytics tracking on the website.", "allowed actions": "[Navigate], [Click], [ReadText], [Scroll], [OpenMenu], [CloseMenu], [ToggleSwitch], [ConfirmAction], [CancelAction], [OpenPrivacySettings], [CloseApplication]", "visible objects": ", , , , , , , , , ,