├── game_agent ├── UI-Tars │ ├── mm_agents │ │ ├── __init__.py │ │ ├── gui_som │ │ │ ├── __init__.py │ │ │ ├── data_preparation │ │ │ │ ├── __init__.py │ │ │ │ └── majestic_million_download.py │ │ │ └── READAME.md │ │ ├── accessibility_tree_wrap │ │ │ ├── __init__.py │ │ │ └── relevant_retrieve.py │ │ └── README.md │ ├── desktop_env │ │ ├── __init__.py │ │ ├── controllers │ │ │ ├── __init__.py │ │ │ └── python.py │ │ └── desktop_env.py │ ├── run.bash │ ├── .env │ ├── tools │ │ ├── __init__.py │ │ ├── screenshot.py │ │ └── load_data.py │ ├── README.md │ └── lib_run_single.py ├── cradle │ ├── gpt_cua │ │ ├── agent │ │ │ └── __init__.py │ │ ├── computers │ │ │ ├── __init__.py │ │ │ ├── computer.py │ │ │ └── computer_use.py │ │ ├── __init__.py │ │ ├── run.bash │ │ ├── .env │ │ ├── log │ │ │ └── sherlock_holmes_the_tea_shop_m_You_are_an_AI_agent_specializi_20250325_202931.txt │ │ ├── utils.py │ │ └── simple_cua_loop.py │ ├── .env │ ├── claude_cua │ │ ├── __init__.py │ │ └── tools │ │ │ ├── collection.py │ │ │ ├── groups.py │ │ │ ├── __init__.py │ │ │ └── base.py │ ├── api │ │ ├── __init__.py │ │ ├── serving │ │ │ └── __init__.py │ │ └── api_caller.py │ ├── agent │ │ └── cradle │ │ │ ├── __init__.py │ │ │ ├── game_end.py │ │ │ └── info_gathering.py │ ├── run.bash │ ├── gui_grounding │ │ ├── __init__.py │ │ └── computer │ │ │ ├── computer.py │ │ │ └── computer_use.py │ ├── tools │ │ ├── __init__.py │ │ ├── screenshot.py │ │ ├── load_data.py │ │ └── utils.py │ ├── main.py │ └── README.md ├── gpt_operator │ ├── agent │ │ └── __init__.py │ ├── .env │ ├── computers │ │ ├── __init__.py │ │ └── computer.py │ ├── run.bash │ ├── README.md │ ├── utils.py │ └── simple_cua_loop.py ├── coast │ ├── gui_agent │ │ ├── gpt_cua │ │ │ ├── agent │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ ├── computers │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ └── computer_use.py │ │ │ ├── utils.py │ │ │ └── simple_cua_loop.py │ │ ├── claude_cua │ │ │ ├── __init__.py │ │ │ └── tools │ 
│ │ │ ├── collection.py │ │ │ │ ├── groups.py │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ ├── __init__.py │ │ ├── gui_grounding │ │ │ ├── __init__.py │ │ │ └── computer │ │ │ │ ├── computer.py │ │ │ │ └── computer_use.py │ │ └── execute.py │ ├── .env │ ├── node │ │ └── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── serving │ │ │ ├── __init__.py │ │ │ └── api_providers.py │ │ └── api_caller.py │ ├── run.bash │ ├── agent │ │ └── __init__.py │ ├── tools │ │ ├── __init__.py │ │ ├── screenshot.py │ │ └── load_data.py │ └── config.yaml ├── .env └── claude_computer_use │ ├── .env │ ├── run.bash │ ├── tools │ ├── collection.py │ ├── groups.py │ ├── __init__.py │ └── base.py │ └── README.md ├── evaluator ├── judge │ ├── vlm │ │ ├── tools │ │ │ ├── serving │ │ │ │ ├── __init__.py │ │ │ │ ├── test.py │ │ │ │ └── api_providers.py │ │ │ └── utils.py │ │ ├── __init__.py │ │ ├── api_caller.py │ │ ├── load_data.py │ │ └── screenshot.py │ └── computer_use │ │ ├── __init__.py │ │ └── tools │ │ ├── collection.py │ │ ├── groups.py │ │ ├── __init__.py │ │ └── base.py ├── run.bash ├── .env ├── screenshots │ ├── crimson_room │ │ └── milestone.png │ ├── camping_room_escape │ │ └── milestone.png │ ├── chemical_room_escape │ │ └── milestone.png │ ├── machine_room_escape │ │ ├── milestone1.png │ │ ├── milestone2.png │ │ └── milestone3.png │ ├── space_museum_escape │ │ └── milestone.png │ ├── vending_machine_room │ │ └── milestone.png │ ├── wood_workshop_escape │ │ └── milestone.png │ ├── computer_office_escape │ │ └── milestone.png │ └── geometric_room_escape │ │ └── milestone.png ├── eval_game │ ├── eval_chemical.py │ ├── eval_sort.py │ ├── eval_crimson.py │ ├── eval_camping.py │ ├── eval_wood.py │ ├── eval_computer.py │ ├── eval_geometric.py │ ├── eval_vending.py │ ├── eval_utils.py │ ├── eval_idol.py │ ├── eval_pico.py │ ├── eval_grim1.py │ ├── eval_kingdom.py │ ├── eval_college.py │ ├── eval_festival.py │ ├── eval_smalltown.py │ ├── eval_grim2.py │ ├── eval_nickbounty.py │ ├── 
eval_sherlock.py │ ├── eval_videostudio.py │ ├── eval_paint.py │ ├── eval_vortex2.py │ ├── eval_saucy.py │ ├── eval_design.py │ ├── eval_vortex3.py │ ├── eval_mirror.py │ ├── eval_ray2.py │ ├── eval_pierre.py │ ├── eval_dakota.py │ ├── eval_vortex.py │ ├── eval_elevator.py │ ├── eval_space.py │ ├── eval_sherlock2.py │ └── eval_gamecafe.py └── evaluate_game.py ├── assets ├── fig_coast.png ├── fig_obgap.png └── fig_keyidea.png ├── .gitignore └── LICENSE /game_agent/UI-Tars/mm_agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluator/judge/vlm/tools/serving/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game_agent/UI-Tars/desktop_env/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /game_agent/UI-Tars/mm_agents/gui_som/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluator/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python evaluate_game.py -------------------------------------------------------------------------------- /game_agent/UI-Tars/desktop_env/controllers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game_agent/UI-Tars/mm_agents/accessibility_tree_wrap/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/game_agent/UI-Tars/mm_agents/gui_som/data_preparation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game_agent/UI-Tars/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run_uitars.py -------------------------------------------------------------------------------- /game_agent/UI-Tars/mm_agents/accessibility_tree_wrap/relevant_retrieve.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game_agent/cradle/gpt_cua/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import Agent 2 | -------------------------------------------------------------------------------- /game_agent/gpt_operator/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import Agent 2 | -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gpt_cua/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import Agent 2 | -------------------------------------------------------------------------------- /evaluator/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/UI-Tars/.env: 
-------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/coast/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/cradle/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/cradle/gpt_cua/computers/__init__.py: -------------------------------------------------------------------------------- 1 | from .computer_use import LocalDesktopComputer 2 | -------------------------------------------------------------------------------- /assets/fig_coast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/assets/fig_coast.png -------------------------------------------------------------------------------- /assets/fig_obgap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/assets/fig_obgap.png -------------------------------------------------------------------------------- /game_agent/gpt_operator/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /assets/fig_keyidea.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/assets/fig_keyidea.png -------------------------------------------------------------------------------- /game_agent/claude_computer_use/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/coast/node/__init__.py: -------------------------------------------------------------------------------- 1 | from .node_inference import Planner 2 | 3 | __all__ = [ 4 | Planner 5 | ] -------------------------------------------------------------------------------- /game_agent/cradle/claude_cua/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import run_agent 2 | 3 | __all__ = [ 4 | "run_agent" 5 | ] -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/claude_cua/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import main 2 | 3 | 4 | 5 | __all__ = [ 6 | "main" 7 | ] 8 | -------------------------------------------------------------------------------- /game_agent/coast/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_caller import ( 2 | api_caller 3 | ) 4 | 5 | __all__ = [ 6 | "api_caller" 7 | ] -------------------------------------------------------------------------------- /game_agent/cradle/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_caller import ( 2 | api_caller 3 | ) 4 | 5 | __all__ = [ 6 | "api_caller" 7 | ] -------------------------------------------------------------------------------- /game_agent/gpt_operator/computers/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .computer import Computer 2 | from .computer_use import LocalDesktopComputer 3 | -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .execute import execute_action 2 | 3 | 4 | 5 | __all__ = [ 6 | execute_action 7 | ] 8 | -------------------------------------------------------------------------------- /game_agent/cradle/gpt_cua/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_cua_loop import( 2 | main_gpt_cua 3 | ) 4 | 5 | __all__ = [ 6 | "main_gpt_cua" 7 | ] -------------------------------------------------------------------------------- /game_agent/cradle/agent/cradle/__init__.py: -------------------------------------------------------------------------------- 1 | from .game_agent import( 2 | run_game_agent 3 | ) 4 | 5 | __all__ = [ 6 | "run_game_agent" 7 | ] -------------------------------------------------------------------------------- /evaluator/screenshots/crimson_room/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/crimson_room/milestone.png -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gpt_cua/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_cua_loop import( 2 | main_gpt_operator 3 | ) 4 | 5 | __all__ = [ 6 | "main_gpt_operator" 7 | ] -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gpt_cua/computers/__init__.py: -------------------------------------------------------------------------------- 1 | from .computer_use import 
LocalDesktopComputer 2 | 3 | 4 | __all__ = [ 5 | "LocalDesktopComputer" 6 | ] -------------------------------------------------------------------------------- /evaluator/screenshots/camping_room_escape/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/camping_room_escape/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/chemical_room_escape/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/chemical_room_escape/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/machine_room_escape/milestone1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/machine_room_escape/milestone1.png -------------------------------------------------------------------------------- /evaluator/screenshots/machine_room_escape/milestone2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/machine_room_escape/milestone2.png -------------------------------------------------------------------------------- /evaluator/screenshots/machine_room_escape/milestone3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/machine_room_escape/milestone3.png -------------------------------------------------------------------------------- /evaluator/screenshots/space_museum_escape/milestone.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/space_museum_escape/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/vending_machine_room/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/vending_machine_room/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/wood_workshop_escape/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/wood_workshop_escape/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/computer_office_escape/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/computer_office_escape/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/geometric_room_escape/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/geometric_room_escape/milestone.png -------------------------------------------------------------------------------- /game_agent/coast/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | SCRIPT_NAME="game_agent.py" 5 | CONFIG_FILE="config.yaml" 6 | 7 | 8 | python "$SCRIPT_NAME" --config "$CONFIG_FILE"; then 9 | -------------------------------------------------------------------------------- 
/game_agent/UI-Tars/mm_agents/gui_som/READAME.md: -------------------------------------------------------------------------------- 1 | Deprecated since we found we can use `accelaerator` to do the same thing. But can be potentially used in the future when only access to screen is available. -------------------------------------------------------------------------------- /game_agent/coast/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import Agent 2 | 3 | from .moduler import ( 4 | SeekerBot, 5 | SolverBot, 6 | MapperBot 7 | ) 8 | 9 | __all__ = [ 10 | Agent, 11 | SeekerBot, 12 | SolverBot, 13 | MapperBot 14 | ] 15 | -------------------------------------------------------------------------------- /game_agent/coast/api/serving/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_providers import ( 2 | openai_completion, 3 | anthropic_completion, 4 | gemini_completion 5 | ) 6 | 7 | __all__ = [ 8 | "openai_completion", 9 | "anthropic_completion", 10 | "gemini_completion" 11 | ] -------------------------------------------------------------------------------- /game_agent/cradle/api/serving/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_providers import ( 2 | openai_completion, 3 | anthropic_completion, 4 | gemini_completion 5 | ) 6 | 7 | __all__ = [ 8 | "openai_completion", 9 | "anthropic_completion", 10 | "gemini_completion" 11 | ] -------------------------------------------------------------------------------- /game_agent/cradle/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL="claude-3-7-sonnet-20250219" 4 | PROVIDER="anthropic" 5 | CUA="claude" 6 | 7 | ## other_options 8 | # MODEL="gpt-4o" 9 | # PROVIDER="openai" 10 | # CUA="gpt" 11 | # CUA="sonnet" 12 | # CUA="uground" 13 | 14 | ## Execute 15 | python main.py 
--model "$MODEL" --provider "$PROVIDER" --cua "$CUA" -------------------------------------------------------------------------------- /game_agent/cradle/gui_grounding/__init__.py: -------------------------------------------------------------------------------- 1 | from .computer.computer_use import( 2 | LocalDesktopComputer 3 | ) 4 | 5 | from .uground import( 6 | agent_step 7 | ) 8 | from .claude import( 9 | run_claude_gui_agent 10 | ) 11 | 12 | 13 | __all__ = [ 14 | "LocalDesktopComputer", 15 | "agent_step", 16 | "run_claude_gui_agent" 17 | ] -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gui_grounding/__init__.py: -------------------------------------------------------------------------------- 1 | from .computer.computer_use import( 2 | LocalDesktopComputer 3 | ) 4 | 5 | from .uground import( 6 | agent_step 7 | ) 8 | from .claude import( 9 | run_claude_gui_agent 10 | ) 11 | 12 | 13 | __all__ = [ 14 | "LocalDesktopComputer", 15 | "agent_step", 16 | "run_claude_gui_agent" 17 | ] -------------------------------------------------------------------------------- /game_agent/claude_computer_use/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=$(pwd)/.. 4 | 5 | # Choose task by number (1-based index from tasks.json) 6 | 7 | TASK_FILE="./json/game_prompt.json" 8 | PROMPT_TYPE="prompt" 9 | MAX_ACTIONS=1000 10 | 11 | python main.py \ 12 | --task-file "$TASK_FILE" \ 13 | --prompt-type "$PROMPT_TYPE" \ 14 | --max-actions "$MAX_ACTIONS" -------------------------------------------------------------------------------- /game_agent/cradle/gpt_cua/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHON=python 4 | MAIN_SCRIPT="main.py" 5 | DEFAULT_PROMPT_FILE="tasks.json" 6 | 7 | echo "🚀 Agent 자동화 루프 시작 중..." 
8 | 9 | # prompt.json 파일이 존재하면 자동으로 전달 10 | if [ -f "$DEFAULT_PROMPT_FILE" ]; then 11 | echo "📝 $DEFAULT_PROMPT_FILE 감지됨, 자동으로 프롬프트 사용" 12 | $PYTHON $MAIN_SCRIPT "$DEFAULT_PROMPT_FILE" 13 | else 14 | echo "📄 $DEFAULT_PROMPT_FILE 없음, 기본 프롬프트로 실행" 15 | $PYTHON $MAIN_SCRIPT 16 | fi -------------------------------------------------------------------------------- /game_agent/cradle/gpt_cua/.env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY="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" 2 | # ANTHROPIC_API_KEY="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" 3 | # GEMINI_API_KEY="sk-zzzzzzzzzzzzzzzzzzzzzz" 4 | 5 | 6 | # OPENAI_API_KEY="sk-svcacct-GGftbt0z90DZrqmTZ6O8M17wtdQtTHDuxTQIdNec1diEUQSc-DPmSwDi6Zmx4S3EYIYT3BlbkFJhzY7qIGmeQJVIL2GMldI8mFYtwg8vWKOEvoOgjL9SKWXZbZhjsNW3N0u3db-CQJX4JwA" -------------------------------------------------------------------------------- /evaluator/judge/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cross-platform computer interaction tools for Anthropic AI. 3 | This package provides tools for AI to interact with computers running Windows, macOS, or Linux. 
4 | """ 5 | 6 | __version__ = "0.1.0" 7 | __author__ = "FlashAdventure" 8 | 9 | """ 10 | Autonomous Computer Use Agent with Claude Sonnet 11 | """ 12 | 13 | from .evaluator import ( 14 | main, 15 | ) 16 | 17 | from .load_data import( 18 | load_game_prompt_eval, 19 | ) 20 | 21 | 22 | 23 | __all__ = [ 24 | "main", 25 | "load_game_prompt_eval", 26 | 27 | ] 28 | -------------------------------------------------------------------------------- /game_agent/gpt_operator/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHON=python 4 | MAIN_SCRIPT="main.py" 5 | DEFAULT_PROMPT_FILE="game_prompts.json" 6 | HISTORY=10 7 | 8 | echo "🚀 Starting agent automation loop..." 9 | 10 | # Automatically pass the prompt.json file if it exists 11 | if [ -f "$DEFAULT_PROMPT_FILE" ]; then 12 | echo "📝 $DEFAULT_PROMPT_FILE detected, using prompt automatically" 13 | $PYTHON $MAIN_SCRIPT "$DEFAULT_PROMPT_FILE" --history $HISTORY 14 | else 15 | echo "📄 $DEFAULT_PROMPT_FILE not found, running with default prompt" 16 | $PYTHON $MAIN_SCRIPT --history $HISTORY 17 | fi -------------------------------------------------------------------------------- /game_agent/cradle/agent/cradle/game_end.py: -------------------------------------------------------------------------------- 1 | from api import api_caller 2 | 3 | def game_end(system_prompt, screen, api_provider, model_name): 4 | prompt = f""" 5 | Current Screen:\n 6 | [Image]\n\n 7 | 8 | Please check whether the current screen indicates that the game has been completely and successfully cleared.\n 9 | If successful, the player either escapes the room or sees a message indicating the game has been completed.\n 10 | If it has, output [Done].\n 11 | """ 12 | 13 | response = api_caller(api_provider, system_prompt, model_name, prompt, screen) 14 | 15 | return response -------------------------------------------------------------------------------- /game_agent/UI-Tars/tools/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .load_data import ( 2 | save_chat_log, 3 | load_game_prompt, 4 | load_system_prompt 5 | ) 6 | 7 | from .screenshot import ( 8 | capture_flash_screenshot 9 | ) 10 | 11 | from .utils import ( 12 | encode_images_to_base64, 13 | encode_image, 14 | extract_python_code, 15 | extract_action_change 16 | ) 17 | 18 | __all__ = [ 19 | "save_chat_log", 20 | "load_game_prompt", 21 | "load_game_prompt_eval", 22 | "capture_flash_screenshot", 23 | "encode_image", 24 | "extract_python_code", 25 | "extract_action_change", 26 | "encode_images_to_base64", 27 | "load_system_prompt" 28 | ] -------------------------------------------------------------------------------- /game_agent/cradle/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .load_data import ( 2 | save_chat_log, 3 | load_game_prompt, 4 | load_system_prompt 5 | ) 6 | 7 | from .screenshot import ( 8 | capture_flash_screenshot 9 | ) 10 | 11 | from .utils import ( 12 | encode_images_to_base64, 13 | encode_image, 14 | extract_python_code, 15 | extract_action_change 16 | ) 17 | 18 | __all__ = [ 19 | "save_chat_log", 20 | "load_game_prompt", 21 | "load_game_prompt_eval", 22 | "capture_flash_screenshot", 23 | "encode_image", 24 | "extract_python_code", 25 | "extract_action_change", 26 | "encode_images_to_base64", 27 | "load_system_prompt" 28 | ] -------------------------------------------------------------------------------- /evaluator/eval_game/eval_chemical.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | ## Configuration Dictionary 6 | config = { 7 | "game": "Chemical Room Escape", 8 | "api_provider": "anthropic", 9 | "model_name": "claude-3-7-sonnet-20250219", 10 | "loop_interval": 3 11 | } 12 | 13 | 14 | system_prompt, 
evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 15 | 16 | 17 | config["system_prompt"] = system_prompt 18 | config["evaluation_prompt"] = evaluation_prompt 19 | config["example_image_path"] = example_image_path 20 | 21 | ## Running 22 | evaluator_none_cua(**config) 23 | 24 | 25 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_sort.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | ## Configuration Dictionary 6 | 7 | config = { 8 | "game": "Sort the Court", 9 | "api_provider": "anthropic", 10 | "model_name": "claude-3-7-sonnet-20250219", 11 | "loop_interval": 3 12 | } 13 | 14 | 15 | 16 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 17 | 18 | config["system_prompt"] = system_prompt 19 | config["evaluation_prompt"] = evaluation_prompt 20 | config["example_image_path"] = example_image_path 21 | 22 | 23 | ## Running 24 | evaluator_none_cua(**config) 25 | 26 | 27 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_crimson.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | ## Configuration Dictionary 6 | 7 | 8 | config = { 9 | "game": "Crimson Room", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 16 | 17 | 18 | config["system_prompt"] = system_prompt 19 | config["evaluation_prompt"] = evaluation_prompt 20 | config["example_image_path"] = example_image_path 21 | 22 | 23 | ## Running 24 | evaluator_none_cua(**config) 25 | 26 | 
27 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_camping.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | 6 | ## Configuration Dictionary 7 | 8 | config = { 9 | "game": "Camping Room Escape", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 16 | 17 | 18 | config["system_prompt"] = system_prompt 19 | config["evaluation_prompt"] = evaluation_prompt 20 | config["example_image_path"] = example_image_path 21 | 22 | 23 | ## Running 24 | evaluator_none_cua(**config) 25 | 26 | 27 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_wood.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | 6 | ## Configuration Dictionary 7 | 8 | config = { 9 | "game": "Wood Workshop Escape", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 16 | 17 | 18 | config["system_prompt"] = system_prompt 19 | config["evaluation_prompt"] = evaluation_prompt 20 | config["example_image_path"] = example_image_path 21 | 22 | 23 | ## Running 24 | evaluator_none_cua(**config) 25 | 26 | 27 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_computer.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import 
load_game_prompt_eval 3 | 4 | 5 | 6 | ## Configuration Dictionary 7 | 8 | config = { 9 | "game": "Computer Office Escape", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 16 | 17 | 18 | config["system_prompt"] = system_prompt 19 | config["evaluation_prompt"] = evaluation_prompt 20 | config["example_image_path"] = example_image_path 21 | 22 | 23 | ## Running 24 | evaluator_none_cua(**config) 25 | 26 | 27 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_geometric.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | 6 | ## Configuration Dictionary 7 | 8 | config = { 9 | "game": "Geometric Room Escape", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | 16 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 17 | 18 | 19 | config["system_prompt"] = system_prompt 20 | config["evaluation_prompt"] = evaluation_prompt 21 | config["example_image_path"] = example_image_path 22 | 23 | 24 | ## Running 25 | evaluator_none_cua(**config) 26 | 27 | 28 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_vending.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | 6 | ## Configuration Dictionary 7 | 8 | config = { 9 | "game": "Vending Machine Room Escape", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | 16 | system_prompt, 
import json
import sys
import re
import asyncio
from pathlib import Path
from judge.computer_use.main import main as run_main

# Scratch file used to hand a single-task spec to the computer-use runner.
TEMP_TASK_FILE = "temp_task.json"


async def run_milestone(prompt_text: str, task_name: str) -> list[str]:
    """Run the computer-use agent on a single milestone prompt.

    Writes a one-task JSON file, points the runner's CLI at it via
    ``sys.argv``, awaits the run, and returns the resulting conversation.

    Args:
        prompt_text: The milestone prompt to evaluate.
        task_name: Key under which the task is stored in the temp file.

    Returns:
        The conversation produced by ``run_main()``.
    """
    task_data = {
        task_name: {
            "milestone_prompt": prompt_text,
        }
    }
    with open(TEMP_TASK_FILE, "w") as f:
        json.dump(task_data, f)

    # Fix: the original overwrote sys.argv permanently, leaking fake CLI
    # state into the rest of the process. Save and restore it around the run.
    saved_argv = sys.argv
    sys.argv = [
        "main.py",
        "--task-file", TEMP_TASK_FILE,
        "--task-name", task_name,
        "--prompt-type", "milestone_prompt",
    ]
    try:
        return await run_main()  # returns conversation
    finally:
        sys.argv = saved_argv
Once counted, output the result in the following format: ### Output Format #### New Suspect: [Number of occurrences]Do not perform any further interactions after counting. Your task ends once the count is provided. 2 | 3 | ASSISTANT: New Suspect: 1 4 | 5 | USER: 6 | 7 | -------------------------------------------------------------------------------- /evaluator/judge/vlm/api_caller.py: -------------------------------------------------------------------------------- 1 | from judge.vlm.tools.serving.api_providers import anthropic_completion, openai_completion, gemini_completion 2 | 3 | def api_caller(api_provider, system_prompt, model_name, move_prompts, base64_image=None, base64_image2=None): 4 | base64_images = [img for img in [base64_image, base64_image2] if img] 5 | 6 | if api_provider == "anthropic": 7 | response = anthropic_completion(system_prompt, model_name, base64_images, move_prompts) 8 | elif api_provider == "openai": 9 | response = openai_completion(system_prompt, model_name, base64_images, move_prompts) 10 | elif api_provider == "gemini": 11 | response = gemini_completion(system_prompt, model_name, base64_images, move_prompts) 12 | else: 13 | raise NotImplementedError(f"API provider '{api_provider}' is not supported.") 14 | 15 | return response 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 💾 Cache and Temporary Files 2 | cache/* 3 | __pycache__/ 4 | */__pycache__/ 5 | *.pyc 6 | *.pyo 7 | *.pyd 8 | *.mp4 9 | *.sh 10 | 11 | # 🔧 Compiled Binaries and Object Files 12 | *.o 13 | *.so 14 | *.dll 15 | *.dylib 16 | *.exe 17 | *.out 18 | *.a 19 | *.lib 20 | *.obj 21 | 22 | # 📄 Logs and Debug Files 23 | *.log 24 | *.tmp 25 | *.swp 26 | *.swo 27 | 28 | 29 | # 📌 Build and Dependency Directories 30 | bin/ 31 | build/ 32 | dist/ 33 | *.egg-info/ 34 | *.manifest 35 | *.spec 36 | 37 | # 🏗️ Make and CMake Files 38 | 
from typing import Protocol, List, Literal, Dict


class Computer(Protocol):
    """Defines the 'shape' (methods/properties) our loop expects."""

    @property
    def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ...

    @property
    def dimensions(self) -> tuple[int, int]: ...
    # Screen size as (width, height) in pixels.

    def screenshot(self) -> str: ...
    # Returns the screenshot as a string (presumably base64 -- confirm with implementations).

    def click(self, x: int, y: int, button: str = "left") -> None: ...

    def double_click(self, x: int, y: int) -> None: ...

    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ...

    def type(self, text: str) -> None: ...

    def wait(self, ms: int = 1000) -> None: ...

    def move(self, x: int, y: int) -> None: ...

    def keypress(self, keys: List[str]) -> None: ...

    def drag(self, path: List[Dict[str, int]]) -> None: ...

    # Fix: this was declared without `self`, so the protocol required a
    # method taking zero arguments when invoked on an instance -- no
    # conforming implementation could satisfy it.
    def get_current_url(self) -> str: ...
from typing import Protocol, List, Literal, Dict


class Computer(Protocol):
    """Defines the 'shape' (methods/properties) our loop expects."""

    @property
    def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ...

    @property
    def dimensions(self) -> tuple[int, int]: ...
    # Screen size as (width, height) in pixels.

    def screenshot(self) -> str: ...
    # Returns the screenshot as a string (presumably base64 -- confirm with implementations).

    def click(self, x: int, y: int, button: str = "left") -> None: ...

    def double_click(self, x: int, y: int) -> None: ...

    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ...

    def type(self, text: str) -> None: ...

    def wait(self, ms: int = 1000) -> None: ...

    def move(self, x: int, y: int) -> None: ...

    def keypress(self, keys: List[str]) -> None: ...

    def drag(self, path: List[Dict[str, int]]) -> None: ...

    # Fix: this was declared without `self`, so the protocol required a
    # method taking zero arguments when invoked on an instance -- no
    # conforming implementation could satisfy it.
    def get_current_url(self) -> str: ...
4 | """ 5 | 6 | __version__ = "0.1.0" 7 | __author__ = "FlashAdventure" 8 | 9 | """ 10 | Autonomous Computer Use Agent with Claude Sonnet 11 | """ 12 | 13 | from .app import ( 14 | get_screen_details, 15 | load_api_key, 16 | load_tasks, 17 | get_task_prompt, 18 | api_response_callback, 19 | tool_output_callback, 20 | message_callback, 21 | run_agent, 22 | main, 23 | ) 24 | 25 | # loop 모듈 26 | from .loop import ( 27 | APIProvider, 28 | sampling_loop, 29 | ) 30 | 31 | __all__ = [ 32 | "get_screen_details", 33 | "load_api_key", 34 | "load_tasks", 35 | "get_task_prompt", 36 | "api_response_callback", 37 | "tool_output_callback", 38 | "message_callback", 39 | "run_agent", 40 | "main", 41 | "APIProvider", 42 | "sampling_loop", 43 | ] 44 | -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gpt_cua/computers/computer.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol, List, Literal, Dict 2 | 3 | 4 | class Computer(Protocol): 5 | """Defines the 'shape' (methods/properties) our loop expects.""" 6 | 7 | @property 8 | def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ... 9 | @property 10 | def dimensions(self) -> tuple[int, int]: ... 11 | 12 | def screenshot(self) -> str: ... 13 | 14 | def click(self, x: int, y: int, button: str = "left") -> None: ... 15 | 16 | def double_click(self, x: int, y: int) -> None: ... 17 | 18 | def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ... 19 | 20 | def type(self, text: str) -> None: ... 21 | 22 | def wait(self, ms: int = 1000) -> None: ... 23 | 24 | def move(self, x: int, y: int) -> None: ... 25 | 26 | def keypress(self, keys: List[str]) -> None: ... 27 | 28 | def drag(self, path: List[Dict[str, int]]) -> None: ... 29 | 30 | def get_current_url() -> str: ... 
import os

import anthropic
import dotenv


def main():
    """One-off smoke test of the Anthropic computer-use beta endpoint."""
    # Load environment variables from .env (expects ANTHROPIC_API_KEY).
    dotenv.load_dotenv()

    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

    response = client.beta.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=1024,
        tools=[
            {
                "type": "computer_20250124",
                "name": "computer",
                "display_width_px": 1024,
                "display_height_px": 768,
                "display_number": 1,
            },
            {
                "type": "text_editor_20241022",
                "name": "str_replace_editor"
            },
            {
                "type": "bash_20241022",
                "name": "bash"
            }
        ],
        messages=[{"role": "user", "content": "Save a picture of a cat to my desktop."}],
        betas=["computer-use-2025-01-24"],
        thinking={"type": "enabled", "budget_tokens": 1024}
    )
    print(response)


if __name__ == "__main__":
    # Fixes: `import anthropic` appeared twice, and the (billed) API request
    # ran at import time; the guard makes importing this module side-effect
    # free apart from loading .env.
    main()
"""Collection classes for managing multiple tools."""

from typing import Any

from anthropic.types.beta import BetaToolUnionParam

from .base import (
    BaseAnthropicTool,
    ToolError,
    ToolFailure,
    ToolResult,
)


class ToolCollection:
    """A collection of anthropic-defined tools, dispatchable by name."""

    def __init__(self, *tools: BaseAnthropicTool):
        self.tools = tools
        # Map each tool's declared API name to its instance for O(1) dispatch.
        name_to_tool = {}
        for tool in tools:
            name_to_tool[tool.to_params()["name"]] = tool
        self.tool_map = name_to_tool

    def to_params(
        self,
    ) -> list[BetaToolUnionParam]:
        """Serialize every tool to its Anthropic API parameter form."""
        params = []
        for tool in self.tools:
            params.append(tool.to_params())
        return params

    async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult:
        """Invoke the tool registered under *name* with *tool_input*.

        Unknown names and ToolError exceptions are converted into
        ToolFailure results rather than raised.
        """
        tool = self.tool_map.get(name)
        if not tool:
            return ToolFailure(error=f"Tool {name} is invalid")
        try:
            return await tool(**tool_input)
        except ToolError as e:
            return ToolFailure(error=e.message)
from .load_data import (
    save_chat_log,
    load_game_prompt,
    load_config,
    load_memory,
    load_action_prompt
)

from .screenshot import (
    capture_flash_screenshot
)

from .utils import (
    encode_images_to_base64,
    encode_image,
    extract_python_code,
    extract_action_change,
    append_to_json_list,
    extract_clues_from_text,
    extract_episodic_memory_from_text,
    extract_json_block_from_response
)

# Fix: __all__ previously advertised names that are never imported here
# (load_game_prompt_eval, load_system_prompt, extract_json_from_messages),
# which makes `from <pkg> import *` raise AttributeError. Keep this list
# in sync with the imports above.
__all__ = [
    "save_chat_log",
    "load_game_prompt",
    "load_config",
    "load_memory",
    "load_action_prompt",
    "capture_flash_screenshot",
    "encode_images_to_base64",
    "encode_image",
    "extract_python_code",
    "extract_action_change",
    "append_to_json_list",
    "extract_clues_from_text",
    "extract_episodic_memory_from_text",
    "extract_json_block_from_response",
]
26 | grounding_width: 1366 27 | grounding_height: 768 28 | 29 | # Other options (optional) 30 | timeout: 30 # API timeout in seconds 31 | max_steps: 100 # Maximum number of execution steps 32 | 33 | 34 | ### Clue_Solver Max Action: 35 | max_actions_solver: 5 36 | ### Clue_Seeker Max Action: 37 | max_actions_seeker: 15 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Jaewoo Ahn, Junseo Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
import json
import os


def save_chat_log(entry, LOG_FILE):
    """Append a game move entry to a JSON log file.

    The log is stored as a JSON list; a missing or corrupt file is
    treated as an empty log rather than raising.
    """
    logs = []
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, "r") as f:
            try:
                logs = json.load(f)
            except json.JSONDecodeError:
                logs = []

    logs.append(entry)
    with open(LOG_FILE, "w") as f:
        json.dump(logs, f, indent=4)


def load_game_prompt_eval(game_name, image_num=1, json_path="./milestone_prompts.json"):
    """Load prompt and evaluation data for a specific game from JSON.

    Fixes: the docstring used to sit *after* the first statement, making it
    a no-op string literal rather than a docstring, and the prompts file
    path was a hard-coded local -- it is now a parameter with the same
    default, so existing callers are unaffected.

    Args:
        game_name: Key of the game inside the prompts JSON.
        image_num: Which numbered example image path to fetch.
        json_path: Path to the milestone prompts JSON file.

    Returns:
        Tuple of (prompt, evaluation_prompt, example_image_path); any of
        them may be None if absent from the JSON entry.

    Raises:
        ValueError: If the game is not present in the JSON file.
    """
    with open(json_path, "r") as f:
        game_data = json.load(f)

    try:
        entry = game_data[game_name]
    except KeyError:
        raise ValueError(f"No prompt found for game '{game_name}'.") from None

    prompt = entry.get("prompt")
    eval_prompt = entry.get("evaluation_prompt")
    example_image_path = entry.get(f"example_image_path{image_num}", None)

    return prompt, eval_prompt, example_image_path
from dataclasses import dataclass
from typing import Literal

from .base import BaseAnthropicTool
from .bash import ShellTool20241022, ShellTool20250124
from .computer import ComputerTool20241022, ComputerTool20250124
from .edit import CrossPlatformEditTool20241022, CrossPlatformEditTool20250124

# Supported tool-group versions and the beta flags that enable them.
ToolVersion = Literal["computer_use_20250124", "computer_use_20241022"]
BetaFlag = Literal["computer-use-2024-10-22", "computer-use-2025-01-24"]


@dataclass(frozen=True, kw_only=True)
class ToolGroup:
    """An immutable bundle of tools released under one API version."""

    version: ToolVersion
    tools: list[type[BaseAnthropicTool]]
    beta_flag: BetaFlag | None = None


# Named groups first, then the registry, so each release reads as a unit.
_GROUP_20241022 = ToolGroup(
    version="computer_use_20241022",
    tools=[ComputerTool20241022, CrossPlatformEditTool20241022, ShellTool20241022],
    beta_flag="computer-use-2024-10-22",
)

_GROUP_20250124 = ToolGroup(
    version="computer_use_20250124",
    tools=[ComputerTool20250124, CrossPlatformEditTool20250124, ShellTool20250124],
    beta_flag="computer-use-2025-01-24",
)

TOOL_GROUPS: list[ToolGroup] = [_GROUP_20241022, _GROUP_20250124]

TOOL_GROUPS_BY_VERSION = {group.version: group for group in TOOL_GROUPS}
@dataclass(frozen=True, kw_only=True) 14 | class ToolGroup: 15 | version: ToolVersion 16 | tools: list[type[BaseAnthropicTool]] 17 | beta_flag: BetaFlag | None = None 18 | 19 | 20 | TOOL_GROUPS: list[ToolGroup] = [ 21 | ToolGroup( 22 | version="computer_use_20241022", 23 | tools=[ComputerTool20241022, CrossPlatformEditTool20241022, ShellTool20241022], 24 | beta_flag="computer-use-2024-10-22", 25 | ), 26 | ToolGroup( 27 | version="computer_use_20250124", 28 | tools=[ComputerTool20250124, CrossPlatformEditTool20250124, ShellTool20250124], 29 | beta_flag="computer-use-2025-01-24", 30 | ), 31 | ] 32 | 33 | TOOL_GROUPS_BY_VERSION = {tool_group.version: tool_group for tool_group in TOOL_GROUPS} -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/claude_cua/tools/groups.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | from .base import BaseAnthropicTool 5 | from .bash import ShellTool20241022, ShellTool20250124 6 | from .computer import ComputerTool20241022, ComputerTool20250124 7 | from .edit import CrossPlatformEditTool20241022, CrossPlatformEditTool20250124 8 | 9 | ToolVersion = Literal["computer_use_20250124", "computer_use_20241022"] 10 | BetaFlag = Literal["computer-use-2024-10-22", "computer-use-2025-01-24"] 11 | 12 | 13 | @dataclass(frozen=True, kw_only=True) 14 | class ToolGroup: 15 | version: ToolVersion 16 | tools: list[type[BaseAnthropicTool]] 17 | beta_flag: BetaFlag | None = None 18 | 19 | 20 | TOOL_GROUPS: list[ToolGroup] = [ 21 | ToolGroup( 22 | version="computer_use_20241022", 23 | tools=[ComputerTool20241022, CrossPlatformEditTool20241022, ShellTool20241022], 24 | beta_flag="computer-use-2024-10-22", 25 | ), 26 | ToolGroup( 27 | version="computer_use_20250124", 28 | tools=[ComputerTool20250124, CrossPlatformEditTool20250124, ShellTool20250124], 29 | beta_flag="computer-use-2025-01-24", 
from abc import ABC, abstractmethod
from typing import List, Dict, Literal


class Computer(ABC):
    """Abstract base class describing a controllable desktop environment."""

    @property
    @abstractmethod
    def environment(self) -> Literal["windows", "mac", "linux"]:
        """Host OS this backend drives."""

    @property
    @abstractmethod
    def dimensions(self) -> tuple[int, int]:
        """Screen size as (width, height) in pixels."""

    @abstractmethod
    def screenshot(self) -> str:
        """Capture the screen as a string (presumably base64 -- confirm with implementations)."""

    @abstractmethod
    def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at (x, y) with the given mouse button."""

    @abstractmethod
    def double_click(self, x: int, y: int) -> None:
        """Double-click at (x, y)."""

    @abstractmethod
    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll by (scroll_x, scroll_y) with the pointer at (x, y)."""

    @abstractmethod
    def type(self, text: str) -> None:
        """Type the given text."""

    @abstractmethod
    def wait(self, ms: int = 1000) -> None:
        """Pause for the given number of milliseconds."""

    @abstractmethod
    def move(self, x: int, y: int) -> None:
        """Move the pointer to (x, y)."""

    @abstractmethod
    def keypress(self, keys: List[str]) -> None:
        """Press the given keys."""

    @abstractmethod
    def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag the pointer along the given path of points."""

    @abstractmethod
    def get_current_url(self) -> str:
        """Return the current URL -- NOTE(review): confirm semantics for non-browser backends."""
def execute_action(action_prompt, system_prompt=None, encoded_image=None, gui_model="gpt_operator", reasoning_model="gpt-4o", type=None):
    """Dispatch an action prompt to the configured GUI backend.

    Supported backends: gpt_operator, claude_cua, uground, claude_sonnet.
    Async backends are driven via asyncio.run. An unknown gui_model is
    reported and yields 0.
    """
    if gui_model == "gpt_operator":
        return main_gpt_operator(user_prompt=action_prompt)

    if gui_model == "claude_cua":
        coro = main_claude_cua(
            user_prompt=action_prompt,
            system_prompt=system_prompt,
            type=type,
        )
        return asyncio.run(coro)

    if gui_model == "uground":
        provider = "openai" if reasoning_model == "gpt-4o" else "anthropic"
        coro = main_uground(
            user_prompt=action_prompt,
            encoded_image=encoded_image,
            provider=provider,
            model=reasoning_model,
        )
        return asyncio.run(coro)

    if gui_model == "claude_sonnet":
        coro = main_claude_sonnet(
            user_prompt=action_prompt,
            encoded_image=encoded_image,
        )
        return asyncio.run(coro)

    print(f"[ERROR] Unknown gui_model: {gui_model}")
    return 0
41 | "EditTool20241022", 42 | ] -------------------------------------------------------------------------------- /game_agent/claude_computer_use/tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cross-platform computer interaction tools for Anthropic AI. 3 | This package provides tools for AI to interact with computers running Windows, macOS, or Linux. 4 | """ 5 | 6 | from .base import BaseAnthropicTool, ToolError, ToolFailure, ToolResult 7 | from .collection import ToolCollection 8 | from .bash import ShellTool20241022, ShellTool20250124 9 | from .computer import ComputerTool20241022, ComputerTool20250124 10 | from .edit import CrossPlatformEditTool20241022, CrossPlatformEditTool20250124 11 | from .groups import TOOL_GROUPS, TOOL_GROUPS_BY_VERSION, ToolVersion, BetaFlag 12 | 13 | __all__ = [ 14 | "BaseAnthropicTool", 15 | "ToolResult", 16 | "ToolError", 17 | "ToolFailure", 18 | "ToolCollection", 19 | "ShellTool20241022", 20 | "ShellTool20250124", 21 | "ComputerTool20241022", 22 | "ComputerTool20250124", 23 | "CrossPlatformEditTool20241022", 24 | "CrossPlatformEditTool20250124", 25 | "TOOL_GROUPS", 26 | "TOOL_GROUPS_BY_VERSION", 27 | "ToolVersion", 28 | "BetaFlag", 29 | ] 30 | 31 | # Aliases for backwards compatibility 32 | from .bash import ShellTool20250124 as BashTool20250124 33 | from .bash import ShellTool20241022 as BashTool20241022 34 | from .edit import CrossPlatformEditTool20250124 as EditTool20250124 35 | from .edit import CrossPlatformEditTool20241022 as EditTool20241022 36 | 37 | __all__ += [ 38 | "BashTool20250124", 39 | "BashTool20241022", 40 | "EditTool20250124", 41 | "EditTool20241022", 42 | ] -------------------------------------------------------------------------------- /game_agent/cradle/claude_cua/tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cross-platform computer interaction tools for Anthropic AI. 
3 | This package provides tools for AI to interact with computers running Windows, macOS, or Linux. 4 | """ 5 | 6 | from .base import BaseAnthropicTool, ToolError, ToolFailure, ToolResult 7 | from .collection import ToolCollection 8 | from .bash import ShellTool20241022, ShellTool20250124 9 | from .computer import ComputerTool20241022, ComputerTool20250124 10 | from .edit import CrossPlatformEditTool20241022, CrossPlatformEditTool20250124 11 | from .groups import TOOL_GROUPS, TOOL_GROUPS_BY_VERSION, ToolVersion, BetaFlag 12 | 13 | __all__ = [ 14 | "BaseAnthropicTool", 15 | "ToolResult", 16 | "ToolError", 17 | "ToolFailure", 18 | "ToolCollection", 19 | "ShellTool20241022", 20 | "ShellTool20250124", 21 | "ComputerTool20241022", 22 | "ComputerTool20250124", 23 | "CrossPlatformEditTool20241022", 24 | "CrossPlatformEditTool20250124", 25 | "TOOL_GROUPS", 26 | "TOOL_GROUPS_BY_VERSION", 27 | "ToolVersion", 28 | "BetaFlag", 29 | ] 30 | 31 | # Aliases for backwards compatibility 32 | from .bash import ShellTool20250124 as BashTool20250124 33 | from .bash import ShellTool20241022 as BashTool20241022 34 | from .edit import CrossPlatformEditTool20250124 as EditTool20250124 35 | from .edit import CrossPlatformEditTool20241022 as EditTool20241022 36 | 37 | __all__ += [ 38 | "BashTool20250124", 39 | "BashTool20241022", 40 | "EditTool20250124", 41 | "EditTool20241022", 42 | ] -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/claude_cua/tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cross-platform computer interaction tools for Anthropic AI. 3 | This package provides tools for AI to interact with computers running Windows, macOS, or Linux. 
4 | """ 5 | 6 | from .base import BaseAnthropicTool, ToolError, ToolFailure, ToolResult 7 | from .collection import ToolCollection 8 | from .bash import ShellTool20241022, ShellTool20250124 9 | from .computer import ComputerTool20241022, ComputerTool20250124 10 | from .edit import CrossPlatformEditTool20241022, CrossPlatformEditTool20250124 11 | from .groups import TOOL_GROUPS, TOOL_GROUPS_BY_VERSION, ToolVersion, BetaFlag 12 | 13 | __all__ = [ 14 | "BaseAnthropicTool", 15 | "ToolResult", 16 | "ToolError", 17 | "ToolFailure", 18 | "ToolCollection", 19 | "ShellTool20241022", 20 | "ShellTool20250124", 21 | "ComputerTool20241022", 22 | "ComputerTool20250124", 23 | "CrossPlatformEditTool20241022", 24 | "CrossPlatformEditTool20250124", 25 | "TOOL_GROUPS", 26 | "TOOL_GROUPS_BY_VERSION", 27 | "ToolVersion", 28 | "BetaFlag", 29 | ] 30 | 31 | # Aliases for backwards compatibility 32 | from .bash import ShellTool20250124 as BashTool20250124 33 | from .bash import ShellTool20241022 as BashTool20241022 34 | from .edit import CrossPlatformEditTool20250124 as EditTool20250124 35 | from .edit import CrossPlatformEditTool20241022 as EditTool20241022 36 | 37 | __all__ += [ 38 | "BashTool20250124", 39 | "BashTool20241022", 40 | "EditTool20250124", 41 | "EditTool20241022", 42 | ] -------------------------------------------------------------------------------- /game_agent/cradle/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from agent.cradle import run_game_agent 3 | import json 4 | import time 5 | 6 | def select_game_from_json(prompt_file_path="./json/game_prompts.json"): 7 | with open(prompt_file_path, "r", encoding="utf-8") as f: 8 | game_data = json.load(f) 9 | 10 | game_names = list(game_data.keys()) 11 | 12 | print("🎮 Select a game to play:") 13 | for idx, name in enumerate(game_names, start=1): 14 | print(f"{idx}. 
{name}") 15 | 16 | while True: 17 | try: 18 | choice = int(input("\nEnter number ▶ ")) 19 | if 1 <= choice <= len(game_names): 20 | return game_names[choice - 1] 21 | else: 22 | print("❌ Invalid number. Please try again.") 23 | except ValueError: 24 | print("❌ Please enter a number.") 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--model", default="gpt-4") 30 | parser.add_argument("--provider", default="openai") 31 | parser.add_argument("--cua", default="gpt") 32 | parser.add_argument("--max_actions", default=1000) 33 | args = parser.parse_args() 34 | 35 | game_name = select_game_from_json("./json/game_prompts.json") 36 | 37 | print("\n⏳ Preparing game... please wait.") 38 | time.sleep(5) # ✅ Delay before execution 39 | 40 | result = run_game_agent( 41 | api_provider=args.provider, 42 | model_name=args.model, 43 | game_name=game_name, 44 | cua=args.cua, 45 | max_actions=args.max_actions 46 | ) 47 | 48 | print("\n📦 Final execution result:") 49 | print(result) 50 | 51 | if __name__ == "__main__": 52 | main() -------------------------------------------------------------------------------- /game_agent/cradle/agent/cradle/info_gathering.py: -------------------------------------------------------------------------------- 1 | from api import api_caller 2 | 3 | def info_gather(system_prompt, api_provider, model_name, before_encoded): 4 | """ 5 | 현재 화면을 캡처하고 AI로 분석하여 화면 정보를 수집합니다. 6 | capture this screen and gather information 7 | """ 8 | 9 | 10 | prompt = """ 11 | The following image is a screenshot of the current computer screen.\n 12 | Carefully observe the screen and identify all **key visual elements**, such as:\n\n 13 | 14 | - Visible text (e.g., labels, instructions, titles, tooltips)\n 15 | - Interactive elements (e.g., buttons, icons, menus, input fields, sliders)\n 16 | - Status indicators or feedback messages\n 17 | - Any notable layout structures or visual groupings\n\n 18 | 19 | Objectives:\n 20 | 1. 
def api_caller(api_provider, system_prompt, model_name, move_prompts, base64_images=None):
    """
    Unified API caller for multiple model providers.

    Parameters:
    - api_provider (str): "anthropic", "openai", or "gemini"
    - system_prompt (str): System-level instruction
    - model_name (str): Model identifier (e.g., "gpt-4", "claude-3")
    - move_prompts (str): Main user prompt for this action
    - base64_images (str | list[str] | None): Single base64 image or list of them

    Returns:
    - response (str): Textual result from model
    """

    # Coerce the image argument into a list so the provider functions always
    # receive list[str]; reject anything that is not str / list / None.
    if base64_images is None:
        images = []
    elif isinstance(base64_images, str):
        images = [base64_images]
    elif isinstance(base64_images, list):
        images = base64_images
    else:
        raise TypeError("base64_images must be a base64 string, a list of strings, or None.")

    for img in images:
        if not isinstance(img, str):
            raise ValueError("Each item in base64_images must be a string.")

    # Route to the provider-specific completion function.
    if api_provider == "anthropic":
        return anthropic_completion(system_prompt, model_name, images, move_prompts)
    if api_provider == "openai":
        return openai_completion(system_prompt, model_name, images, move_prompts)
    if api_provider == "gemini":
        return gemini_completion(system_prompt, model_name, images, move_prompts)

    raise NotImplementedError(f"Unsupported API provider: '{api_provider}'")
6 | 7 | Parameters: 8 | - api_provider (str): "anthropic", "openai", or "gemini" 9 | - system_prompt (str): System-level instruction 10 | - model_name (str): Model identifier (e.g., "gpt-4", "claude-3") 11 | - move_prompts (str): Main user prompt for this action 12 | - base64_images (str | list[str] | None): Single base64 image or list of them 13 | 14 | Returns: 15 | - response (str): Textual result from model 16 | """ 17 | 18 | # --- Normalize image input --- 19 | if isinstance(base64_images, str): 20 | base64_images = [base64_images] 21 | elif base64_images is None: 22 | base64_images = [] 23 | elif not isinstance(base64_images, list): 24 | raise TypeError("base64_images must be a base64 string, a list of strings, or None.") 25 | 26 | if not all(isinstance(img, str) for img in base64_images): 27 | raise ValueError("Each item in base64_images must be a string.") 28 | 29 | # --- Dispatch based on provider --- 30 | if api_provider == "anthropic": 31 | return anthropic_completion(system_prompt, model_name, base64_images, move_prompts) 32 | 33 | elif api_provider == "openai": 34 | return openai_completion(system_prompt, model_name, base64_images, move_prompts) 35 | 36 | elif api_provider == "gemini": 37 | return gemini_completion(system_prompt, model_name, base64_images, move_prompts) 38 | 39 | else: 40 | raise NotImplementedError(f"Unsupported API provider: '{api_provider}'") -------------------------------------------------------------------------------- /game_agent/gpt_operator/README.md: -------------------------------------------------------------------------------- 1 | # **FlashAdventure: An Agent for Flash Game Environments - GPT Operator** 2 | 3 | This guide covers the execution of the GPT Operator game agent developed for research on autonomous agents in Flash game environments. 4 | 5 | All core setup, including the Python environment and API keys, is consistent with the main `README` in the parent directory. 
This guide focuses specifically on running this agent. 6 | 7 | ----- 8 | 9 | ### **1. Agent Execution** 10 | 11 | #### **1.1 The Run Script (`run.bash`)** 12 | 13 | First, navigate to the `game_agent/gpt_operator/` directory. Then, create a file named `run.bash` and add the following script. This script automatically handles loading game prompts and setting history parameters. 14 | 15 | ```bash 16 | #!/bin/bash 17 | 18 | PYTHON=python 19 | MAIN_SCRIPT="main.py" 20 | DEFAULT_PROMPT_FILE="game_prompts.json" 21 | HISTORY=10 22 | 23 | echo "🚀 Starting agent automation loop..." 24 | 25 | # Automatically use the game_prompts.json file if it exists 26 | if [ -f "$DEFAULT_PROMPT_FILE" ]; then 27 | echo "📝 $DEFAULT_PROMPT_FILE detected, using prompt automatically" 28 | $PYTHON $MAIN_SCRIPT "$DEFAULT_PROMPT_FILE" --history $HISTORY 29 | else 30 | echo "📄 $DEFAULT_PROMPT_FILE not found, running with default prompt" 31 | $PYTHON $MAIN_SCRIPT --history $HISTORY 32 | fi 33 | ``` 34 | 35 | #### **1.2 Run the Agent** 36 | 37 | To start the agent, first grant execution permissions to the script and then run it from your terminal: 38 | 39 | ```bash 40 | chmod +x run.bash # (Run this once to grant permissions) 41 | ./run.bash 42 | ``` 43 | 44 | ----- 45 | 46 | ### **2. Execution Summary** 47 | 48 | 1. **Run Script:** Create and configure the `run.bash` script with your desired parameters. 49 | 2. **Launch:** Execute `./run.bash` to start the agent.
-------------------------------------------------------------------------------- /evaluator/eval_game/eval_idol.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_affection(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Highest EXP:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_idol(): 17 | game = "Idol Days Sim Date" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_affection(conversation) 39 | 40 | if affection is not None: 41 | print(f"Highest Experience Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "idol_days_sim_date", 45 | "result": f"EXP: {affection}", 46 | "EXP": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_idol.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_idol.json") 53 | else: 54 | print("⚠️ Couldn't find EXP score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_idol()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_pico.py: 
-------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_affection(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Affection:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_pico(): 17 | game = "Pico Sim Date" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_affection(conversation) 39 | 40 | if affection is not None: 41 | print(f"💖 Affection Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "pico_sim_date", 45 | "result": f"Affection: {affection}", 46 | "affection": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_pico.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_pico.json") 53 | else: 54 | print("⚠️ Couldn't find affection score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_pico()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_grim1.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | 
from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_item(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Found Items:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_grim1(): 17 | game = "Grim Tales: The Bride" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_item(conversation) 39 | 40 | if affection is not None: 41 | print(f"Highest Experience Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "grim_tales_the_bride", 45 | "result": f"Items: {affection}", 46 | "Items": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_grim1.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_grim1.json") 53 | else: 54 | print("⚠️ Couldn't find Found Items in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_grim1()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_kingdom.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def 
extract_affection(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Highest EXP:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_kingdom(): 17 | game = "Kingdom Days" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_affection(conversation) 39 | 40 | if affection is not None: 41 | print(f"Highest Experience Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "kingdom_days", 45 | "result": f"EXP: {affection}", 46 | "EXP": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_kingdom.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_kingdom.json") 53 | else: 54 | print("⚠️ Couldn't find EXP score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_kingdom()) -------------------------------------------------------------------------------- /game_agent/claude_computer_use/README.md: -------------------------------------------------------------------------------- 1 | # **FlashAdventure: An Agent for Flash Game Environments - Claude Computer Use** 2 | 3 | This guide covers the execution of the Claude Computer Use game agent developed for research on autonomous agents in Flash game environments. 
4 | 5 | All core setup, including creating the Python environment and configuring API keys, is consistent with the main `README` in the parent directory. This guide focuses specifically on running this agent. 6 | 7 | ----- 8 | 9 | ### **1. Agent Execution** 10 | 11 | #### **1.1 The Run Script (`run.bash`)** 12 | 13 | First, navigate to the `game_agent/claude_computer_use/` directory. Create a file named `run.bash` and add the following script. This script defines key execution parameters and then launches the agent. 14 | 15 | ```bash 16 | #!/bin/bash 17 | 18 | # Ensure Python can find the agent modules in the parent directory 19 | export PYTHONPATH=$(pwd)/.. 20 | 21 | # Configuration for the agent 22 | # The file containing game-specific prompts 23 | TASK_FILE="./json/game_prompt.json" 24 | # The type of prompt to use from the file (e.g., "prompt") 25 | PROMPT_TYPE="prompt" 26 | # The maximum number of actions the agent will take in a single session 27 | MAX_ACTIONS=1000 28 | 29 | # Execute the main agent script with the defined parameters 30 | python main.py \ 31 | --task-file "$TASK_FILE" \ 32 | --prompt-type "$PROMPT_TYPE" \ 33 | --max-actions "$MAX_ACTIONS" 34 | ``` 35 | 36 | #### **1.2 Run the Agent** 37 | 38 | To start the agent, first grant execute permissions to the script and then run it from your terminal: 39 | 40 | ```bash 41 | chmod +x run.bash # (Run this once to grant permissions) 42 | ./run.bash 43 | ``` 44 | 45 | ----- 46 | 47 | ### **2. Execution Summary** 48 | 49 | 1. **Run Script:** Create and configure the `run.bash` script with your desired parameters. 50 | 2. **Launch:** Execute `./run.bash` to start the agent. 
-------------------------------------------------------------------------------- /evaluator/eval_game/eval_college.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_affection(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Combined Score:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_college(): 17 | game = "Community College Sim" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_affection(conversation) 39 | 40 | if affection is not None: 41 | print(f"Combined Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "college_sim", 45 | "result": f"Combined Score: {affection}", 46 | "Combined Score": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_college.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_college.json") 53 | else: 54 | print("⚠️ Couldn't find Combined Score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_college()) -------------------------------------------------------------------------------- 
/evaluator/eval_game/eval_festival.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_affection(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Highest EXP:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_festival(): 17 | game = "Festival Days Sim Date" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_affection(conversation) 39 | 40 | if affection is not None: 41 | print(f"Highest Experience Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "festival_sim_date", 45 | "result": f"EXP: {affection}", 46 | "EXP": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_festival.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_festival.json") 53 | else: 54 | print("⚠️ Couldn't find affection score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_festival()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_smalltown.py: 
-------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_places(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Found Place:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_smalltown(): 17 | game = "Small Town Detective" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_places(conversation) 39 | 40 | if affection is not None: 41 | print(f"Found Place: {affection}") 42 | 43 | result_obj = { 44 | "game": "small_town_detective", 45 | "result": f"Found Place: {affection}", 46 | "found_place": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_smalltown.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_smalltown.json") 53 | else: 54 | print("⚠️ Couldn't find affection score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_smalltown()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_grim2.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 
import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_item(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Found Items:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_grim2(): 17 | game = "Grim Tales: The Legacy Collector's Edition" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_item(conversation) 39 | 40 | if affection is not None: 41 | print(f"Highest Experience Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "grim_tales_the_legacy_collectors_edition", 45 | "result": f"Items: {affection}", 46 | "Items": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_grim2.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_grim2.json") 53 | else: 54 | print("⚠️ Couldn't find Found Items in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_grim2()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_nickbounty.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import 
List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_places(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Found Place:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_nickbounty(): 17 | game = "Nick Bounty: A Case of the Crabs" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_places(conversation) 39 | 40 | if affection is not None: 41 | print(f"Found Place: {affection}") 42 | 43 | result_obj = { 44 | "game": "nick_bounty_a_case_of_the_crabs", 45 | "result": f"Found Place: {affection}", 46 | "found_place": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_nickbounty.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_nickbounty.json") 53 | else: 54 | print("⚠️ Couldn't find affection score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_nickbounty()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_sherlock.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from eval_game.eval_utils import run_milestone 5 | from pathlib import Path 6 | from typing import List 7 | 8 | def 
extract_result(conversation: List[str]) -> str | None: 9 | for msg in reversed(conversation): 10 | if "New Suspect:" in msg: 11 | return msg.strip() 12 | return None 13 | 14 | def get_suspect_count(result_text: str) -> int: 15 | match = re.search(r"New Suspect:\s*(\d+)", result_text, re.IGNORECASE) 16 | return int(match.group(1)) if match else 0 17 | 18 | async def eval_sherlock(): 19 | game = "sherlock_holmes_the_tea_shop_murder_mystery" 20 | prompt_key = "milestone_prompt1" 21 | 22 | with open("milestone_prompts.json", "r") as f: 23 | all_data = json.load(f) 24 | 25 | data = all_data.get(game, {}) 26 | if not data: 27 | print(f"❌ No data found for game: {game}") 28 | return 29 | 30 | print(f"🕵️ Running evaluation for: Sherlock Holmes") 31 | 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | result_text = extract_result(conversation) 39 | 40 | if result_text: 41 | count = get_suspect_count(result_text) 42 | print(f"🔍 Final Result: {result_text}") 43 | print(f"🧮 Counted Suspects: {count}") 44 | 45 | # ✅ Save Result 46 | result_obj = { 47 | "game": "sherlock", 48 | "result": result_text, 49 | "#New Suspects": count 50 | } 51 | 52 | Path("results").mkdir(exist_ok=True) 53 | with open("results/result_sherlock.json", "w") as f: 54 | json.dump(result_obj, f, indent=2) 55 | print("📝 Result saved to results/result_sherlock.json") 56 | 57 | else: 58 | print("⚠️ No 'New Suspect' result found.") 59 | 60 | if __name__ == "__main__": 61 | asyncio.run(eval_sherlock()) -------------------------------------------------------------------------------- /game_agent/UI-Tars/tools/screenshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mss 3 | 4 | def get_screenshot_dir(base_dir, cua, model_name, game_name): 5 | """Creates a directory based on game/model/agent.""" 6 | directory = 
os.path.join(base_dir, cua, model_name, game_name) 7 | os.makedirs(directory, exist_ok=True) 8 | return directory 9 | 10 | def get_next_screenshot_filename(directory): 11 | """Generates the next sequential screenshot filename in the given directory.""" 12 | existing_files = [ 13 | f for f in os.listdir(directory) 14 | if f.startswith("flash_screenshot_") and f.endswith(".png") 15 | ] 16 | 17 | numbers = [] 18 | for filename in existing_files: 19 | try: 20 | num_str = filename.replace("flash_screenshot_", "").replace(".png", "") 21 | numbers.append(int(num_str)) 22 | except ValueError: 23 | continue 24 | 25 | next_num = max(numbers, default=0) + 1 26 | return f"flash_screenshot_{next_num:04d}.png" 27 | 28 | def capture_flash_screenshot(game_name, cua, model_name, time=None): 29 | """ 30 | Captures the entire screen and saves it to a folder based on GUI agent / model. 31 | - time=None or "": screenshots/ 32 | - time="after": screenshots_after/ 33 | - time="final": screenshots_final/ 34 | """ 35 | if time not in (None, "", "after", "final"): 36 | raise ValueError("Invalid value for 'time'. 
Use 'after', 'final', or leave it empty.") 37 | 38 | if time == "after": 39 | base_dir = "screenshots_after" 40 | elif time == "final": 41 | base_dir = "screenshots_final" 42 | else: 43 | base_dir = "screenshots" 44 | 45 | directory = get_screenshot_dir(base_dir, cua, model_name, game_name) 46 | filename = get_next_screenshot_filename(directory) 47 | screenshot_path = os.path.join(directory, filename) 48 | 49 | with mss.mss() as sct: 50 | monitor = sct.monitors[1] 51 | screenshot = sct.grab(monitor) 52 | mss.tools.to_png(screenshot.rgb, screenshot.size, output=screenshot_path) 53 | 54 | print(f"[INFO] Screenshot saved to: {screenshot_path}") 55 | return screenshot_path -------------------------------------------------------------------------------- /game_agent/cradle/tools/screenshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mss 3 | 4 | def get_screenshot_dir(base_dir, cua, model_name, game_name): 5 | """Create directory based on game / model / agent""" 6 | directory = os.path.join(base_dir, cua, model_name, game_name) 7 | os.makedirs(directory, exist_ok=True) 8 | return directory 9 | 10 | def get_next_screenshot_filename(directory): 11 | """Generate the next sequential screenshot filename in the given directory""" 12 | existing_files = [ 13 | f for f in os.listdir(directory) 14 | if f.startswith("flash_screenshot_") and f.endswith(".png") 15 | ] 16 | 17 | numbers = [] 18 | for filename in existing_files: 19 | try: 20 | num_str = filename.replace("flash_screenshot_", "").replace(".png", "") 21 | numbers.append(int(num_str)) 22 | except ValueError: 23 | continue 24 | 25 | next_num = max(numbers, default=0) + 1 26 | return f"flash_screenshot_{next_num:04d}.png" 27 | 28 | def capture_flash_screenshot(game_name, cua, model_name, time=None): 29 | """ 30 | Capture the full screen and save it into a folder structured by GUI agent / model. 
31 | - time=None or "": screenshots/ 32 | - time="after": screenshots_after/ 33 | - time="final": screenshots_final/ 34 | """ 35 | if time not in (None, "", "after", "final"): 36 | raise ValueError("Invalid value for 'time'. Use 'after', 'final', or leave it empty.") 37 | 38 | if time == "after": 39 | base_dir = "screenshots_after" 40 | elif time == "final": 41 | base_dir = "screenshots_final" 42 | else: 43 | base_dir = "screenshots" 44 | 45 | directory = get_screenshot_dir(base_dir, cua, model_name, game_name) 46 | filename = get_next_screenshot_filename(directory) 47 | screenshot_path = os.path.join(directory, filename) 48 | 49 | with mss.mss() as sct: 50 | monitor = sct.monitors[1] 51 | screenshot = sct.grab(monitor) 52 | mss.tools.to_png(screenshot.rgb, screenshot.size, output=screenshot_path) 53 | 54 | print(f"[INFO] Screenshot saved to: {screenshot_path}") 55 | return screenshot_path -------------------------------------------------------------------------------- /evaluator/judge/vlm/screenshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import mss 4 | import time 5 | import subprocess 6 | 7 | 8 | # OS 감지 9 | IS_MAC = platform.system() == "Darwin" 10 | 11 | # 🔹 스크린샷 저장 디렉토리 설정 (기본: screenshots/flashpoint/) 12 | SCREENSHOT_DIR = "screenshots/flashpoint/" 13 | 14 | # 저장 경로 디렉토리 생성 15 | os.makedirs(SCREENSHOT_DIR, exist_ok=True) 16 | 17 | 18 | def get_flashpoint_window_position(): 19 | print(f"[INFO] Detecting Flashpoint window on {platform.system()}...") 20 | 21 | if IS_MAC: 22 | script = ''' 23 | tell application "System Events" 24 | set window_list to name of every window of every process whose visible is true 25 | end tell 26 | return window_list 27 | ''' 28 | result = subprocess.run(["osascript", "-e", script], capture_output=True, text=True) 29 | windows = result.stdout.strip().split(", ") 30 | for window in windows: 31 | if "Flashpoint" in window: 32 | return 100, 100, 800, 600 
# 기본값 33 | else: 34 | try: 35 | import pygetwindow as gw 36 | windows = gw.getWindowsWithTitle("Flashpoint") 37 | if windows: 38 | window = windows[0] 39 | return window.left, window.top, window.width, window.height 40 | except ImportError: 41 | print("[ERROR] pygetwindow is not installed. Run: pip install pygetwindow") 42 | return None 43 | 44 | def capture_flash_screenshot(): 45 | position = get_flashpoint_window_position() 46 | timestamp = time.strftime("%Y%m%d_%H%M%S") 47 | screenshot_path = os.path.join(SCREENSHOT_DIR, f"flash_screenshot_{timestamp}.png") 48 | 49 | with mss.mss() as sct: 50 | if position: 51 | left, top, width, height = position 52 | monitor = {"top": top, "left": left, "width": width, "height": height} 53 | else: 54 | monitor = sct.monitors[1] 55 | 56 | screenshot = sct.grab(monitor) 57 | mss.tools.to_png(screenshot.rgb, screenshot.size, output=screenshot_path) 58 | print(f"[INFO] Screenshot saved: {screenshot_path}") 59 | 60 | return screenshot_path 61 | -------------------------------------------------------------------------------- /game_agent/coast/tools/screenshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mss 3 | 4 | def get_screenshot_dir(base_dir, reasoning_model, gui_agent, game_name): 5 | """Creates a directory based on game/model/agent.""" 6 | directory = os.path.join(base_dir, gui_agent, reasoning_model, game_name) 7 | os.makedirs(directory, exist_ok=True) 8 | return directory 9 | 10 | def get_next_screenshot_filename(directory): 11 | """Generates the next sequential screenshot filename in the given directory.""" 12 | existing_files = [ 13 | f for f in os.listdir(directory) 14 | if f.startswith("flash_screenshot_") and f.endswith(".png") 15 | ] 16 | 17 | numbers = [] 18 | for filename in existing_files: 19 | try: 20 | num_str = filename.replace("flash_screenshot_", "").replace(".png", "") 21 | numbers.append(int(num_str)) 22 | except ValueError: 23 | continue 24 | 
25 | next_num = max(numbers, default=0) + 1 26 | return f"flash_screenshot_{next_num:04d}.png" 27 | 28 | def capture_flash_screenshot(game_name, gui_model, reasoning_model, time=None): 29 | """ 30 | Captures the entire screen and saves it to a folder based on GUI agent / model. 31 | - time=None or "": screenshots/ 32 | - time="after": screenshots_after/ 33 | - time="final": screenshots_final/ 34 | """ 35 | if time not in (None, "", "after", "final"): 36 | raise ValueError("Invalid value for 'time'. Use 'after', 'final', or leave it empty.") 37 | 38 | if time == "after": 39 | base_dir = "screenshots_after" 40 | elif time == "final": 41 | base_dir = "screenshots_final" 42 | else: 43 | base_dir = "screenshots" 44 | 45 | directory = get_screenshot_dir(base_dir, gui_model, reasoning_model, game_name) 46 | filename = get_next_screenshot_filename(directory) 47 | screenshot_path = os.path.join(directory, filename) 48 | 49 | with mss.mss() as sct: 50 | monitor = sct.monitors[1] 51 | screenshot = sct.grab(monitor) 52 | mss.tools.to_png(screenshot.rgb, screenshot.size, output=screenshot_path) 53 | 54 | print(f"[INFO] Screenshot saved to: {screenshot_path}") 55 | return screenshot_path -------------------------------------------------------------------------------- /evaluator/eval_game/eval_videostudio.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List, Optional 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | GREEN_RE = re.compile(r"Green\s*Lights\s*:\s*(\d+(?:\.\d+)?)", re.IGNORECASE) 10 | 11 | def extract_green_lights(conversation: List[str]) -> Optional[float]: 12 | for msg in reversed(conversation or []): 13 | m = GREEN_RE.search(msg) 14 | if m: 15 | return float(m.group(1)) 16 | return None 17 | 18 | async def eval_videostudio(): 19 | game = "Video Studio Escape" 20 | 21 | # Load prompt from milestone_prompts.json 22 | 
with open("milestone_prompts.json", "r", encoding="utf-8") as f: 23 | all_data = json.load(f) 24 | 25 | data = all_data.get(game) 26 | if not data: 27 | print(f"❌ No data found for game: {game}") 28 | return 29 | 30 | print(f"🎮 Running evaluation for: {game}") 31 | 32 | # 1) Instruction 33 | if "Instruction" in data: 34 | _ = await run_milestone(data["Instruction"], f"{game}_Instruction") 35 | 36 | # 2) milestone_prompt1 37 | if "milestone_prompt1" not in data: 38 | print(f"❌ No milestone data found for {game} / milestone_prompt1") 39 | return 40 | 41 | conversation = await run_milestone(data["milestone_prompt1"], f"{game}_milestone_prompt1") 42 | if conversation is None: 43 | print("⚠️ No conversation returned from agent.") 44 | return 45 | 46 | green = extract_green_lights(conversation) 47 | if green is not None: 48 | print(f"Green Lights: {green}") 49 | 50 | result_obj = { 51 | "game": "video_studio_escape", 52 | "result": f"green_lights: {green}", 53 | "green_lights": green, 54 | } 55 | 56 | Path("results").mkdir(exist_ok=True) 57 | with open("results/result_videostudio.json", "w", encoding="utf-8") as f: 58 | json.dump(result_obj, f, indent=2, ensure_ascii=False) 59 | print("📝 Result saved to results/result_videostudio.json") 60 | else: 61 | print("⚠️ Couldn't find green_lights in result.") 62 | 63 | if __name__ == "__main__": 64 | asyncio.run(eval_videostudio()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_paint.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List, Optional 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | DOOR_INDEX_RE = re.compile(r"Door\s*Index\s*:\s*(\d+(?:\.\d+)?)", re.IGNORECASE) 10 | 11 | def extract_door_index(conversation: List[str]) -> Optional[float]: 12 | for msg in reversed(conversation or []): 13 | m = 
DOOR_INDEX_RE.search(msg) 14 | if m: 15 | return float(m.group(1)) 16 | return None 17 | 18 | async def eval_paint(): 19 | game = "Paint Room Escape" 20 | 21 | # Load prompt from milestone_prompts.json 22 | with open("milestone_prompts.json", "r", encoding="utf-8") as f: 23 | all_data = json.load(f) 24 | 25 | data = all_data.get(game, {}) 26 | if not data: 27 | print(f"❌ No data found for game: {game}") 28 | return 29 | 30 | print(f"🎮 Running evaluation for: {game}") 31 | 32 | # 1) Instruction 33 | if "Instruction" in data: 34 | _ = await run_milestone(data["Instruction"], f"{game}_Instruction") 35 | 36 | # 2) milestone_prompt1 37 | prompt_key = "milestone_prompt1" 38 | if prompt_key not in data: 39 | print(f"❌ No milestone data found for {game} / {prompt_key}") 40 | return 41 | 42 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 43 | if conversation is None: 44 | print("⚠️ No conversation returned from agent.") 45 | return 46 | 47 | door_index = extract_door_index(conversation) 48 | 49 | if door_index is not None: 50 | print(f"Door color index: {door_index}") 51 | 52 | result_obj = { 53 | "game": "paint_room_escape", 54 | "result": f"door_color_index: {door_index}", 55 | "door_color_index": door_index, 56 | } 57 | 58 | Path("results").mkdir(exist_ok=True) 59 | with open("results/result_paint.json", "w", encoding="utf-8") as f: 60 | json.dump(result_obj, f, indent=2, ensure_ascii=False) 61 | print("📝 Result saved to results/result_paint.json") 62 | else: 63 | print("⚠️ Couldn't find door_color_index in result.") 64 | 65 | if __name__ == "__main__": 66 | asyncio.run(eval_paint()) -------------------------------------------------------------------------------- /game_agent/UI-Tars/mm_agents/README.md: -------------------------------------------------------------------------------- 1 | # Agent 2 | ## Prompt-based Agents 3 | 4 | ### Supported Models 5 | We currently support the following models as the foundational models for the agents: 6 | 
- `GPT-3.5` (gpt-3.5-turbo-16k, ...)
- `GPT-4` (gpt-4-0125-preview, gpt-4-1106-preview, ...)
- `GPT-4V` (gpt-4-vision-preview, ...)
- `Gemini-Pro`
- `Gemini-Pro-Vision`
- `Claude-3, 2` (claude-3-haiku-20240307, claude-3-sonnet-20240229, ...)
- ...

And those from the open-source community:
- `Mixtral 8x7B`
- `QWEN`, `QWEN-VL`
- `CogAgent`
- `Llama3`
- ...

In the future, we will integrate and support more foundational models to enhance digital agents, so stay tuned.

### How to use

```python
from mm_agents.agent import PromptAgent

agent = PromptAgent(
    model="gpt-4-vision-preview",
    observation_type="screenshot",
)
agent.reset()
# say we have an instruction and observation
instruction = "Please help me to find the nearest restaurant."
obs = {"screenshot": open("path/to/observation.jpg", 'rb').read()}
response, actions = agent.predict(
    instruction,
    obs
)
```

### Observation Space and Action Space
We currently support the following observation spaces:
- `a11y_tree`: the accessibility tree of the current screen
- `screenshot`: a screenshot of the current screen
- `screenshot_a11y_tree`: a screenshot of the current screen with the accessibility tree overlay
- `som`: the set-of-mark trick on the current screen, with table metadata included.

And the following action spaces:
- `pyautogui`: valid Python code using the `pyautogui` library
- `computer_13`: a set of enumerated actions designed by us

To feed an observation into the agent, you have to maintain the `obs` variable as a dict with the corresponding information:
```python
# continue from the previous code snippet
obs = {
    "screenshot": open("path/to/observation.jpg", 'rb').read(),
    "a11y_tree": ""  # [a11y_tree data]
}
response, actions = agent.predict(
    instruction,
    obs
)
```

## Efficient Agents, Q* Agents, and more
Stay tuned for more updates.
--------------------------------------------------------------------------------
/game_agent/UI-Tars/README.md:
--------------------------------------------------------------------------------
# **FlashAdventure: An Agent for Flash Game Environments - UI-TARS**

This guide covers the execution of the UI-TARS game agent, which is based on the OSWorld source code. Developed for our research on autonomous agents in Flash game environments, this guide details the steps required to configure and run the agent.

The core setup, including the Python environment and API keys, is consistent with the main `README` in the parent directory. This guide focuses specifically on the UI-TARS agent.

## **1. Agent Execution**

### **1.1 Launch the vLLM Model Server**

The UI-TARS agent requires a vLLM model server to be running. First, launch the model server on a remote GPU machine using the following command:

```bash
python -m vllm.entrypoints.openai.api_server --served-model-name ui-tars --model "ByteDance-Seed/UI-TARS-1.5-7B" --limit-mm-per-prompt image=10 --port 8000
```

### **1.2 Configure the Agent's API Endpoint**

Next, you must configure the agent to connect to your running server.
Open the `mm_agents/uitars_agent.py` file and modify the `base_url` to point to your server's address.

```python
## Line 574
self.vlm = OpenAI(
    base_url="http://your_server_url:8000/v1", api_key="empty"
)
```

### **1.3 The Run Script (`run.bash`)**

Now, create a `run.bash` file in the `game_agent/UI-Tars/` directory with the following content. This simple script will execute the main entry point for the agent.

```bash
#!/bin/bash

# Execute the main agent script
python run_uitars.py
```

### **1.4 Run the Agent**

To start the agent, first grant execution permissions to the script and then run it from your terminal:

```bash
chmod +x run.bash # (Run this once to grant permissions)
./run.bash
```

-----

### **2. Execution Summary**

1. **Load Model:** Launch the UI-Tars model server on your remote GPU.
2. **Configure Agent:** Update the `base_url` in `mm_agents/uitars_agent.py` to point to your server's address.
3. **Run Script:** Create and configure the `run.bash` script with your desired parameters.
4. **Launch:** Execute `./run.bash` to start the agent.
--------------------------------------------------------------------------------
/game_agent/cradle/README.md:
--------------------------------------------------------------------------------
4 | 5 | ----- 6 | 7 | # **FlashAdventure: An Agent for Flash Game Environments - Cradle** 8 | 9 | This project presents a reproduction of the `cradle` game agent, adapted for our research on autonomous agents in Flash game environments. This guide details the steps required to configure and run the agent. 10 | 11 | All core setup, including the Python environment and API keys, is consistent with the main `README` in the parent directory. This guide focuses specifically on running this agent. 12 | 13 | ## **1. Agent Execution** 14 | 15 | ### **1.1 The Run Script (`run.bash`)** 16 | 17 | First, navigate to the `game_agent/cradle/` directory. Then, create a `run.bash` file and add the following script. You can directly edit the variables in this script to configure the agent's behavior. 18 | 19 | ```bash 20 | #!/bin/bash 21 | 22 | # Configuration 23 | # API Provider: Select one of "anthropic" or "openai". 24 | PROVIDER="anthropic" 25 | 26 | # LLM Model: Specifies the model for high-level reasoning. 27 | MODEL="claude-3-7-sonnet-20250219" 28 | 29 | # GUI Agent Type: Select the agent responsible for mouse/keyboard operations. 30 | # "claude": A Claude agent specialized for computer control. 31 | # "sonnet": An agent that uses the original Claude Sonnet model directly. 32 | # "uground": An open-source UGround model. 33 | CUA="claude" 34 | 35 | # Execute 36 | python main.py --model "$MODEL" --provider "$PROVIDER" --cua "$CUA" 37 | ``` 38 | 39 | ### **1.2 Run the Agent** 40 | 41 | To start the agent, grant execute permissions and run the script from your terminal: 42 | 43 | ```bash 44 | chmod +x run.bash # (Run this once) 45 | ./run.bash 46 | ``` 47 | 48 | ----- 49 | 50 | ## **2. Execution Summary** 51 | 52 | 1. **Run Script:** Create and configure `run.bash` with your desired parameters. 53 | 2. **Launch:** Execute `./run.bash` to start the agent. 
-------------------------------------------------------------------------------- /game_agent/cradle/claude_cua/tools/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from dataclasses import dataclass, fields, replace 3 | from typing import Any 4 | 5 | from anthropic.types.beta import BetaToolUnionParam 6 | 7 | 8 | class BaseAnthropicTool(metaclass=ABCMeta): 9 | """Abstract base class for Anthropic-defined tools.""" 10 | 11 | @abstractmethod 12 | def __call__(self, **kwargs) -> Any: 13 | """Executes the tool with the given arguments.""" 14 | ... 15 | 16 | @abstractmethod 17 | def to_params( 18 | self, 19 | ) -> BetaToolUnionParam: 20 | raise NotImplementedError 21 | 22 | 23 | @dataclass(kw_only=True, frozen=True) 24 | class ToolResult: 25 | """Represents the result of a tool execution.""" 26 | 27 | output: str | None = None 28 | error: str | None = None 29 | base64_image: str | None = None 30 | system: str | None = None 31 | 32 | def __bool__(self): 33 | return any(getattr(self, field.name) for field in fields(self)) 34 | 35 | def __add__(self, other: "ToolResult"): 36 | def combine_fields( 37 | field: str | None, other_field: str | None, concatenate: bool = True 38 | ): 39 | if field and other_field: 40 | if concatenate: 41 | return field + other_field 42 | raise ValueError("Cannot combine tool results") 43 | return field or other_field 44 | 45 | return ToolResult( 46 | output=combine_fields(self.output, other.output), 47 | error=combine_fields(self.error, other.error), 48 | base64_image=combine_fields(self.base64_image, other.base64_image, False), 49 | system=combine_fields(self.system, other.system), 50 | ) 51 | 52 | def replace(self, **kwargs): 53 | """Returns a new ToolResult with the given fields replaced.""" 54 | return replace(self, **kwargs) 55 | 56 | 57 | class CLIResult(ToolResult): 58 | """A ToolResult that can be rendered as a CLI output.""" 59 | 60 | 61 | class 
ToolFailure(ToolResult): 62 | """A ToolResult that represents a failure.""" 63 | 64 | 65 | class ToolError(Exception): 66 | """Raised when a tool encounters an error.""" 67 | 68 | def __init__(self, message): 69 | self.message = message -------------------------------------------------------------------------------- /evaluator/judge/computer_use/tools/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from dataclasses import dataclass, fields, replace 3 | from typing import Any 4 | 5 | from anthropic.types.beta import BetaToolUnionParam 6 | 7 | 8 | class BaseAnthropicTool(metaclass=ABCMeta): 9 | """Abstract base class for Anthropic-defined tools.""" 10 | 11 | @abstractmethod 12 | def __call__(self, **kwargs) -> Any: 13 | """Executes the tool with the given arguments.""" 14 | ... 15 | 16 | @abstractmethod 17 | def to_params( 18 | self, 19 | ) -> BetaToolUnionParam: 20 | raise NotImplementedError 21 | 22 | 23 | @dataclass(kw_only=True, frozen=True) 24 | class ToolResult: 25 | """Represents the result of a tool execution.""" 26 | 27 | output: str | None = None 28 | error: str | None = None 29 | base64_image: str | None = None 30 | system: str | None = None 31 | 32 | def __bool__(self): 33 | return any(getattr(self, field.name) for field in fields(self)) 34 | 35 | def __add__(self, other: "ToolResult"): 36 | def combine_fields( 37 | field: str | None, other_field: str | None, concatenate: bool = True 38 | ): 39 | if field and other_field: 40 | if concatenate: 41 | return field + other_field 42 | raise ValueError("Cannot combine tool results") 43 | return field or other_field 44 | 45 | return ToolResult( 46 | output=combine_fields(self.output, other.output), 47 | error=combine_fields(self.error, other.error), 48 | base64_image=combine_fields(self.base64_image, other.base64_image, False), 49 | system=combine_fields(self.system, other.system), 50 | ) 51 | 52 | def replace(self, 
**kwargs): 53 | """Returns a new ToolResult with the given fields replaced.""" 54 | return replace(self, **kwargs) 55 | 56 | 57 | class CLIResult(ToolResult): 58 | """A ToolResult that can be rendered as a CLI output.""" 59 | 60 | 61 | class ToolFailure(ToolResult): 62 | """A ToolResult that represents a failure.""" 63 | 64 | 65 | class ToolError(Exception): 66 | """Raised when a tool encounters an error.""" 67 | 68 | def __init__(self, message): 69 | self.message = message 70 | -------------------------------------------------------------------------------- /game_agent/claude_computer_use/tools/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from dataclasses import dataclass, fields, replace 3 | from typing import Any 4 | 5 | from anthropic.types.beta import BetaToolUnionParam 6 | 7 | 8 | class BaseAnthropicTool(metaclass=ABCMeta): 9 | """Abstract base class for Anthropic-defined tools.""" 10 | 11 | @abstractmethod 12 | def __call__(self, **kwargs) -> Any: 13 | """Executes the tool with the given arguments.""" 14 | ... 
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass, fields, replace
from typing import Any

from anthropic.types.beta import BetaToolUnionParam


class BaseAnthropicTool(metaclass=ABCMeta):
    """Abstract base class for Anthropic-defined tools."""

    @abstractmethod
    def __call__(self, **kwargs) -> Any:
        """Executes the tool with the given arguments."""
        ...

    @abstractmethod
    def to_params(
        self,
    ) -> BetaToolUnionParam:
        # Concrete tools return the tool-definition payload (name, input
        # schema, ...) that is sent to the Anthropic API with a request.
        raise NotImplementedError


@dataclass(kw_only=True, frozen=True)
class ToolResult:
    """Represents the result of a tool execution.

    Immutable; use replace() to derive a modified copy, or `+` to merge
    two results.
    """

    output: str | None = None        # textual output produced by the tool
    error: str | None = None         # error text when the tool failed
    base64_image: str | None = None  # base64-encoded image payload (e.g. screenshot)
    system: str | None = None        # system-level message for the agent loop

    def __bool__(self):
        # Truthy when at least one field is populated.
        return any(getattr(self, field.name) for field in fields(self))

    def __add__(self, other: "ToolResult"):
        """Merge two results field-by-field.

        Text fields are concatenated; base64_image cannot be concatenated,
        so combining two results that both carry an image raises ValueError.
        NOTE: always returns a base ToolResult, even when self is a subclass.
        """
        def combine_fields(
            field: str | None, other_field: str | None, concatenate: bool = True
        ):
            if field and other_field:
                if concatenate:
                    return field + other_field
                raise ValueError("Cannot combine tool results")
            # At most one side is set: keep whichever is non-empty.
            return field or other_field

        return ToolResult(
            output=combine_fields(self.output, other.output),
            error=combine_fields(self.error, other.error),
            base64_image=combine_fields(self.base64_image, other.base64_image, False),
            system=combine_fields(self.system, other.system),
        )

    def replace(self, **kwargs):
        """Returns a new ToolResult with the given fields replaced."""
        return replace(self, **kwargs)


class CLIResult(ToolResult):
    """A ToolResult that can be rendered as a CLI output."""


class ToolFailure(ToolResult):
    """A ToolResult that represents a failure."""


class ToolError(Exception):
    """Raised when a tool encounters an error."""

    def __init__(self, message):
        self.message = message
def create_response(**kwargs):
    """POST the given payload to the OpenAI /v1/responses endpoint.

    Keyword arguments are forwarded verbatim as the JSON request body.
    Returns the decoded JSON response on success, or an empty dict when the
    request fails (network error, timeout, or a non-2xx status) — matching
    the hardened copy of this helper in coast/gui_agent/gpt_cua/utils.py.
    """
    url = "https://api.openai.com/v1/responses"
    headers = {
        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
        "Content-Type": "application/json"
    }

    # Optional organization header.
    openai_org = os.getenv("OPENAI_ORG")
    if openai_org:
        headers["Openai-Organization"] = openai_org

    try:
        # Bounded timeout so a hung request cannot stall the agent loop.
        response = requests.post(url, headers=headers, json=kwargs, timeout=30)
        # Raise on 4xx/5xx instead of attempting to parse an error body,
        # which previously could raise an opaque JSON decode error.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Best-effort contract: log and return a falsy dict rather than
        # crashing the caller.
        print(f"[ERROR] OpenAI API request failed: {e}")
        return {}
def check_blocklisted_url(url: str) -> None:
    """Reject URLs whose host is on the blocklist.

    Raises ValueError when the URL's hostname equals a blocked domain or is
    any subdomain of one; returns None otherwise.
    """
    host = urlparse(url).hostname or ""
    for domain in BLOCKED_DOMAINS:
        # Exact match, or a dot-separated subdomain of the blocked domain.
        if host == domain or host.endswith(f".{domain}"):
            raise ValueError(f"Blocked URL: {url}")
import importlib.util
import asyncio
import os

# Maps the human-readable game name to its evaluation module in ./eval_game.
# Each module eval_<x>.py defines an async entry point with the same name.
AVAILABLE_EVALS = {
    "sherlock": "eval_sherlock",
    "sherlock2": "eval_sherlock2",
    "small town detective": "eval_smalltown",
    "nick bounty a case of the crabs": "eval_nickbounty",
    "gamecafe": "eval_gamecafe",
    "paint room escape": "eval_paint",
    "video studio escape": "eval_videostudio",
    "vortex1": "eval_vortex",
    "vortex2": "eval_vortex2",
    "vortex3": "eval_vortex3",
    "pierre": "eval_pierre",
    "dakota": "eval_dakota",
    "saucy": "eval_saucy",
    "ray and cooper2": "eval_ray2",
    "design house escape": "eval_design",
    "mirror room escape": "eval_mirror",
    "pico sim date": "eval_pico",
    "festival days sim date": "eval_festival",
    "kingdom days": "eval_kingdom",
    "idol days sim date": "eval_idol",
    "community college sim": "eval_college",
    "grim tales the bride": "eval_grim1",
    "grim tales the legacy collectors edition": "eval_grim2",
    "chemical room escape": "eval_chemical",
    "computer office escape": "eval_computer",
    "crimson room": "eval_crimson",
    "geometric room escape": "eval_geometric",
    "machine room escape": "eval_machine",
    "sort the court": "eval_sort",
    "space museum escape": "eval_space",
    "camping room escape": "eval_camping",
    "vending machine room escape": "eval_vending",
    "wood workshop escape": "eval_wood",
    "elevator room escape": "eval_elevator"
}

def choose_game():
    """Prompt the user to pick a game and return its module name.

    Re-prompts until a number within the menu range is entered.  The old
    version did `int(input()) - 1` unchecked: non-numeric input crashed, and
    entering 0 silently selected the LAST game via negative list indexing.
    """
    print("Available evaluations:")
    for i, name in enumerate(AVAILABLE_EVALS, 1):
        print(f"{i}. {name}")
    while True:
        raw = input("Select game to evaluate: ")
        try:
            idx = int(raw) - 1
        except ValueError:
            print("Please enter a number.")
            continue
        if 0 <= idx < len(AVAILABLE_EVALS):
            return list(AVAILABLE_EVALS.values())[idx]
        print(f"Please enter a number between 1 and {len(AVAILABLE_EVALS)}.")

async def main():
    """Load the chosen evaluation module from disk and run its entry point."""
    module_path = "./eval_game"
    module_name = choose_game()
    file_path = os.path.join(module_path, f"{module_name}.py")

    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load evaluation module from {file_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # Module and entry point share the same "eval_<x>" name, so use it
    # directly.  The old derivation f"eval_{module_name.split('_')[-1]}"
    # breaks for any module name containing more than one underscore.
    await module.__dict__[module_name]()

if __name__ == "__main__":
    asyncio.run(main())
"last_attempted": None 32 | } 33 | 34 | print(f"🔍 Running evaluation for: {game}") 35 | 36 | try: 37 | for i in range(1, 5): 38 | key = f"milestone_prompt{i}" 39 | result_obj["last_attempted"] = key 40 | print(f"\n🏁 {key}") 41 | 42 | convo = await run_milestone(data[key], f"{game}_{key}") 43 | result = extract_boolean_result(convo) 44 | 45 | match i: 46 | case 1: result_obj["results"]["pub_owner_gone"] = result 47 | case 2: result_obj["results"]["magician_gone"] = result 48 | case 3: result_obj["results"]["security_guard_gone"] = result 49 | case 4: result_obj["results"]["fence_open"] = result 50 | 51 | print(f"✅ Result: {result}") 52 | 53 | if result is not True: 54 | print(f"🛑 Condition failed at {key}. Stopping evaluation.") 55 | result_obj["failed_at"] = key 56 | return 57 | 58 | print("🎉 All milestones passed successfully!") 59 | 60 | except KeyboardInterrupt: 61 | print("\n⚠️ Evaluation interrupted by user.") 62 | 63 | finally: 64 | Path("results").mkdir(exist_ok=True) 65 | with open("results/result_vortex2.json", "w") as f: 66 | json.dump(result_obj, f, indent=2) 67 | print("📝 Result saved to results/result_vortex2.json") 68 | 69 | if __name__ == "__main__": 70 | asyncio.run(eval_vortex2()) -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gpt_cua/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | from dotenv import load_dotenv 4 | import json 5 | import base64 6 | from PIL import Image 7 | from io import BytesIO 8 | import io 9 | from urllib.parse import urlparse 10 | 11 | load_dotenv(override=True) 12 | 13 | BLOCKED_DOMAINS = [ 14 | "maliciousbook.com", 15 | "evilvideos.com", 16 | "darkwebforum.com", 17 | "shadytok.com", 18 | "suspiciouspins.com", 19 | "ilanbigio.com", 20 | ] 21 | 22 | 23 | def pp(obj): 24 | print(json.dumps(obj, indent=4)) 25 | 26 | 27 | def show_image(base_64_image): 28 | image_data = 
def sanitize_message(msg: dict) -> dict:
    """Redact the screenshot from a computer_call_output message.

    Returns a shallow copy whose output dict has image_url replaced with
    "[omitted]"; every other message is returned unchanged.
    """
    # Guard clauses: only computer_call_output messages with a dict output
    # carry an image to redact.
    if msg.get("type") != "computer_call_output":
        return msg
    output = msg.get("output", {})
    if not isinstance(output, dict):
        return msg

    redacted = dict(msg)
    redacted["output"] = {**output, "image_url": "[omitted]"}
    return redacted
typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | 10 | def extract_boolean_result(conversation: List[str]) -> bool | None: 11 | for msg in reversed(conversation): 12 | match = re.search(r"Result:\s*(True|False)", msg, re.IGNORECASE) 13 | if match: 14 | return match.group(1).lower() == "true" 15 | return None 16 | 17 | async def eval_saucy(): 18 | game = "Saucy Devil Gordon" 19 | 20 | with open("milestone_prompts.json", "r") as f: 21 | all_data = json.load(f) 22 | 23 | data = all_data.get(game, {}) 24 | if not data: 25 | print(f"❌ No data found for game: {game}") 26 | return 27 | 28 | result_obj = { 29 | "game": "saucy_devil_gordon", 30 | "results": {}, 31 | "failed_at": None, 32 | "last_attempted": None 33 | } 34 | 35 | print(f"🔍 Running evaluation for: {game}") 36 | 37 | milestones = [ 38 | ("milestone_prompt1", "pick_coconut", "🥥 Coconut Picked"), 39 | ("milestone_prompt2", "pick_pineapple", "🍍 Pineapple Picked"), 40 | ("milestone_prompt3", "door_open", "🚪 Door is Open"), 41 | ("milestone_prompt4", "rock_light", "💡 Rock Light"), 42 | ] 43 | 44 | try: 45 | for key, result_key, label in milestones: 46 | result_obj["last_attempted"] = key 47 | print(f"\n🏁 {key}") 48 | 49 | if key not in data: 50 | print(f"⚠️ Skipping {key} (not in prompt data)") 51 | continue 52 | 53 | convo = await run_milestone(data[key], f"{game}_{key}") 54 | result = extract_boolean_result(convo) 55 | result_obj["results"][result_key] = result 56 | 57 | print(f"{label}: {result}") 58 | 59 | if result is not True: 60 | print(f"🛑 {label} failed. 
async def eval_design():
    """Evaluate the 'Design House Escape' game milestone-by-milestone.

    Reads prompts from milestone_prompts.json, runs each milestone through
    run_milestone(), and writes a summary to results/result_design.json.
    Evaluation stops at the first milestone that does not come back True.
    """
    game = "Design House Escape"

    with open("milestone_prompts.json", "r") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    # failed_at / last_attempted let post-hoc analysis distinguish
    # "never ran" from "ran and failed".
    result_obj = {
        "game": "design_house_escape",
        "results": {},
        "failed_at": None,
        "last_attempted": None
    }

    print(f"🔍 Running evaluation for: {game}")

    # (prompt key, result field, human-readable label) per milestone.
    milestones = [
        ("milestone_prompt1", "cube_exists", "🧊 Cube Exists"),
        ("milestone_prompt2", "1st_door_open", "🚪 1st Door Open"),
        ("milestone_prompt3", "2nd_door_open", "🚪 2nd Door Open"),
        ("milestone_prompt4", "3rd_door_open", "🚪 3rd Door Open"),
    ]

    try:
        for key, result_key, label in milestones:
            result_obj["last_attempted"] = key
            print(f"\n🏁 {key}")

            if key not in data:
                print(f"⚠️ Skipping {key} (not found in prompt data)")
                continue

            convo = await run_milestone(data[key], f"{game}_{key}")
            result = extract_boolean_result(convo)
            result_obj["results"][result_key] = result

            print(f"{label}: {result}")

            # Milestones are sequential: a non-True result (False or None)
            # makes the remaining ones unreachable, so stop here.
            if result is not True:
                print(f"🛑 {label} failed. Stopping evaluation.")
                result_obj["failed_at"] = key
                return

        print("🎉 All milestones passed successfully!")

    except KeyboardInterrupt:
        print("\n⚠️ Evaluation interrupted by user.")

    finally:
        # Runs on success, early return, and Ctrl-C alike, so a (possibly
        # partial) result file is always written.
        Path("results").mkdir(exist_ok=True)
        with open("results/result_design.json", "w") as f:
            json.dump(result_obj, f, indent=2)
        print("📝 Result saved to results/result_design.json")
("milestone_prompt1", "eat_hamburger", "Man eating hamburger"), 39 | ("milestone_prompt2", "toilet_open", "Toilet Open"), 40 | ("milestone_prompt3", "shopkeeper_gone", "Shop Keeper Gone"), 41 | ("milestone_prompt4", "ride_boat", "Ride Boat") 42 | ] 43 | 44 | try: 45 | for key, result_key, label in milestones: 46 | result_obj["last_attempted"] = key 47 | print(f"\n🏁 {key}") 48 | 49 | if key not in data: 50 | print(f"⚠️ Skipping {key} (not present in prompt data)") 51 | continue 52 | 53 | convo = await run_milestone(data[key], f"{game}_{key}") 54 | result = extract_boolean_result(convo) 55 | result_obj["results"][result_key] = result 56 | 57 | print(f"{label}: {result}") 58 | 59 | if result is not True: 60 | print(f"🛑 {label} failed. Stopping evaluation.") 61 | result_obj["failed_at"] = key 62 | return 63 | 64 | print("🎉 All milestones passed successfully!") 65 | 66 | except KeyboardInterrupt: 67 | print("\n⚠️ Evaluation interrupted by user.") 68 | 69 | finally: 70 | Path("results").mkdir(exist_ok=True) 71 | with open("results/result_vortex3.json", "w") as f: 72 | json.dump(result_obj, f, indent=2) 73 | print("📝 Result saved to results/result_vortex3.json") 74 | 75 | if __name__ == "__main__": 76 | asyncio.run(eval_vortex3()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_mirror.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_boolean_result(conversation: List[str]) -> bool | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Result:\s*(True|False)", msg, re.IGNORECASE) 12 | if match: 13 | return match.group(1).lower() == "true" 14 | return None 15 | 16 | async def eval_mirror(): 17 | game = "Mirror Room Escape" 18 | 19 | with open("milestone_prompts.json", "r") as f: 20 | 
all_data = json.load(f) 21 | 22 | data = all_data.get(game, {}) 23 | if not data: 24 | print(f"❌ No data found for game: {game}") 25 | return 26 | 27 | result_obj = { 28 | "game": "mirror_room_escape", 29 | "results": {}, 30 | "failed_at": None, 31 | "last_attempted": None 32 | } 33 | 34 | print(f"🔍 Running evaluation for: {game}") 35 | 36 | milestones = [ 37 | ("milestone_prompt1", "colorful_door_open", "🎨 Colorful Door Open"), 38 | ("milestone_prompt2", "cabinet_mirror_open", "🪞 Cabinet Mirror Open"), 39 | ("milestone_prompt3", "display_show", "🖥️ Display Shown"), 40 | ("milestone_prompt4", "2nd_door_open", "🚪 2nd Door Open"), 41 | ] 42 | 43 | try: 44 | for key, result_key, label in milestones: 45 | result_obj["last_attempted"] = key 46 | print(f"\n🏁 {key}") 47 | 48 | if key not in data: 49 | print(f"⚠️ Skipping {key} (not found in prompt data)") 50 | continue 51 | 52 | convo = await run_milestone(data[key], f"{game}_{key}") 53 | result = extract_boolean_result(convo) 54 | result_obj["results"][result_key] = result 55 | 56 | print(f"{label}: {result}") 57 | 58 | if result is not True: 59 | print(f"🛑 {label} failed. Stopping evaluation.") 60 | result_obj["failed_at"] = key 61 | return 62 | 63 | print("🎉 All milestones passed successfully!") 64 | 65 | except KeyboardInterrupt: 66 | print("\n⚠️ Evaluation interrupted by user.") 67 | 68 | finally: 69 | Path("results").mkdir(exist_ok=True) 70 | with open("results/result_mirror.json", "w") as f: 71 | json.dump(result_obj, f, indent=2) 72 | print("📝 Result saved to results/result_mirror.json") 73 | 74 | if __name__ == "__main__": 75 | asyncio.run(eval_mirror()) -------------------------------------------------------------------------------- /game_agent/cradle/tools/load_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def save_chat_log(entry, game_name, api_model, cua): 6 | """ 7 | Save game action logs to a JSON file. 
def _load_prompt_field(game_name, field, missing_message):
    """Shared loader: read game_prompts.json and return one field for a game.

    Raises ValueError with *missing_message* when the game is absent.
    """
    json_path = "./json/game_prompts.json"
    with open(json_path, "r", encoding="utf-8") as f:
        game_data = json.load(f)

    if game_name not in game_data:
        raise ValueError(missing_message)
    return game_data[game_name][field]


def load_game_prompt(game_name):
    """ Load the prompt and control keys for a specific game from JSON """
    return _load_prompt_field(
        game_name, "prompt", f"No prompt found for game '{game_name}'."
    )


def load_system_prompt(game_name):
    """ Load the system prompt for a specific game from JSON """
    return _load_prompt_field(
        game_name, "system_prompt", f"No system prompt found for game '{game_name}'."
    )
def load_memory_prompt(game_name, memory_type="task", path_map=None):
    """
    Load a memory JSON file for *game_name*.

    memory_type: "task" | "skill" (or any key supplied via path_map)
    path_map: optional dict overriding/extending the default file paths
    Returns the parsed JSON, or an empty container ({} for "skill",
    [] otherwise) when the file does not exist.
    """
    paths = {
        "task": f"./{game_name}/task_memory.json",
        "skill": f"./{game_name}/skills.json",
    }
    if path_map:
        paths.update(path_map)

    if memory_type not in paths:
        raise ValueError("memory_type must be either 'task' or 'skill'.")

    memory_file = paths[memory_type]
    if os.path.exists(memory_file):
        with open(memory_file, "r", encoding="utf-8") as f:
            return json.load(f)

    # Missing file: fall back to an empty container of the expected shape.
    return {} if memory_type == "skill" else []
Loads the prompt and action keys for a specific game from a JSON file """ 42 | json_path = "./json/game_prompts.json" 43 | with open(json_path, "r", encoding="utf-8") as f: 44 | game_data = json.load(f) 45 | 46 | if game_name in game_data: 47 | g = game_data[game_name] 48 | return g["system_prompt"] 49 | else: 50 | raise ValueError(f"No system prompt exists for game '{game_name}'.") 51 | 52 | def load_memory_prompt(game_name, memory_type="task", path_map=None): 53 | """ 54 | Loads the JSON memory file based on memory type. 55 | memory_type: "task" | "skill" 56 | path_map: Optional dict to customize the path 57 | """ 58 | default_paths = { 59 | "task": f"./{game_name}/task_memory.json", 60 | "skill": f"./{game_name}/skills.json" 61 | } 62 | 63 | if path_map: 64 | default_paths.update(path_map) 65 | if memory_type not in default_paths: 66 | raise ValueError("memory_type must be 'task' or 'skill'.") 67 | 68 | json_path = default_paths[memory_type] 69 | 70 | if not os.path.exists(json_path): 71 | return {} if memory_type == "skill" else [] 72 | with open(json_path, "r", encoding="utf-8") as f: 73 | return json.load(f) -------------------------------------------------------------------------------- /evaluator/judge/vlm/tools/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import base64 3 | import re 4 | 5 | def encode_image(image_path): 6 | """ 7 | Read a file from disk and return its contents as a base64-encoded string. 
8 | """ 9 | with open(image_path, "rb") as image_file: 10 | return base64.b64encode(image_file.read()).decode("utf-8") 11 | 12 | def log_output(thread_id, log_text, game): 13 | """ 14 | Logs output to `cache/thread_{thread_id}/output.log` 15 | """ 16 | thread_folder = f"cache/{game}/thread_{thread_id}" 17 | os.makedirs(thread_folder, exist_ok=True) 18 | 19 | log_path = os.path.join(thread_folder, "output.log") 20 | with open(log_path, "w", encoding="utf-8") as log_file: 21 | log_file.write(log_text + "\n\n") 22 | 23 | def extract_python_code(content): 24 | if not content: 25 | print("[ERROR] extract_python_code() received empty content") 26 | return "", "" 27 | 28 | print(f"[DEBUG] Raw content received:\n{content}\n") 29 | 30 | # 🔹 "code" 키 다음에 나오는 Python 코드 부분만 추출 31 | match = re.search(r'"code"\s*:\s*"""\s*(.*?)\s*"""', content, re.DOTALL) 32 | action_match = re.search(r'"action":\s*"([^"]+)"', content) # 🔹 action 값 찾기 33 | 34 | action_text = action_match.group(1) if action_match else "" # 🔹 문자열 값만 추출 35 | 36 | if match: 37 | code_content = match.group(1) # Python 코드 부분만 가져오기 38 | 39 | # 🔹 주석 제거 (멀티라인 """ """ 주석 & 단일 줄 # 주석) 40 | code_content = re.sub(r'""".*?"""', '', code_content, flags=re.DOTALL).strip() 41 | code_content = re.sub(r'^\s*#.*$', '', code_content, flags=re.MULTILINE).strip() 42 | 43 | print(f"[DEBUG] Extracted Code:\n{code_content}\n") 44 | print(f"[DEBUG] Extracted Action:\n{action_text}\n") # 🔹 action 값 출력 45 | return code_content, action_text 46 | 47 | print("[ERROR] No Python code found in content.") 48 | return "", action_text # 🔹 항상 action_text도 반환 49 | 50 | ### action 후에 화면 변화했는지 찾기 51 | def extract_action_change(content): # content = "reason: ... 
### Check whether the screen changed after the action
def extract_action_change(content): # content = "reason: ... Success_Action: True"
    """Parse a judge response for its "Success_Action: True/False" verdict.

    Returns True or False when the marker is found (case-insensitive).
    Returns "" when *content* is empty or the marker is absent —
    NOTE(review): callers therefore get a mixed bool-or-str return;
    presumably they only test truthiness — confirm at call sites.
    """
    if not content:
        print("empty content")
        return ""

    print(f"[DEBUG] Raw content received:\n{content}\n")

    # Look for "Success_Action: True" or "Success_Action: False".
    match = re.search(r"Success_Action:\s*(True|False)", content, re.IGNORECASE)

    if match:
        result_success = match.group(1).lower() == "true"  # convert string to boolean
        return result_success  # True or False

    print("[WARNING] Success_Action not found in content.")
    return ""
    def reset(self, task_config: Optional[Dict[str, Any]] = None, **kwargs) -> Dict[str, Any]:
        """Clear the action history, pick up task id/instruction from
        *task_config* (when given), and return the initial observation."""
        self.action_history.clear()
        self.task_id = task_config.get("id", "default_task") if task_config else "default_task"
        self.instruction = task_config.get("instruction", "") if task_config else ""
        return self._get_obs()

    def _get_obs(self) -> Dict[str, Any]:
        """Build the current observation: a controller screenshot plus None
        placeholders for the a11y tree and terminal (not captured here)."""
        return {
            "screenshot": self.controller.get_screenshot(),
            "accessibility_tree": None,
            "terminal": None,
            "instruction": self.instruction,
        }

    def step(self, action: str, pause: float = 1.0):
        """Execute one action and return (obs, reward, done, info).

        "WAIT", "FAIL" and "DONE" are control sentinels; any other string is
        handed to the controller's execute_python_command (pyautogui code).
        Reward is always 0.0 in this local environment.
        """
        self.action_history.append(action)

        done = False
        info = {}

        if action == "WAIT":
            time.sleep(pause)
        elif action == "FAIL":
            done = True
            info["fail"] = True
        elif action == "DONE":
            done = True
            info["done"] = True
        else:
            self.controller.execute_python_command(action)

        # Give the UI time to settle before grabbing the next screenshot.
        time.sleep(pause)
        obs = self._get_obs()
        return obs, 0.0, done, info

    def evaluate(self):
        """Placeholder score — local mode has no task evaluator."""
        return 0.0

    def close(self):
        """Nothing to release locally; just log the shutdown."""
        logger.info("LocalDesktopEnv closed.")

    def render(self, mode="rgb_array"):
        """Return a screenshot for "rgb_array" mode; any other mode raises."""
        if mode == "rgb_array":
            return self.controller.get_screenshot()
        raise ValueError(f"Unsupported render mode: {mode}")
async def eval_ray2():
    """Evaluate the 'Ray and Cooper 2' milestones sequentially.

    Each milestone prompt is run through the agent; a milestone passes only
    when the conversation ends with 'Result: True'.  Evaluation stops at the
    first failure, and the result object is always written to
    results/result_ray2.json via the finally block (even on Ctrl-C).
    """
    game = "Ray and Cooper 2"

    with open("milestone_prompts.json", "r") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    # failed_at / last_attempted let a partial run be diagnosed from the JSON.
    result_obj = {
        "game": "ray and cooper 2",
        "results": {},
        "failed_at": None,
        "last_attempted": None
    }

    print(f"🔍 Running evaluation for: {game}")

    # (prompt key in milestone_prompts.json, result key, display label)
    milestones = [
        ("milestone_prompt1", "vent_open", "🌀 Vent Open"),
        ("milestone_prompt2", "chef_disappear", "👨‍🍳 Chef Disappeared"),
        ("milestone_prompt3", "door_open", "🚪 Door Open"),
        ("milestone_prompt4", "attendant_disappear", "🧍‍♂️ Attendant Disappeared"),
        ("milestone_prompt5", "woman_disappear", "👩 Woman Disappeared"),
    ]

    try:
        for key, result_key, label in milestones:
            result_obj["last_attempted"] = key
            print(f"\n🏁 {key}")

            # Missing prompts are skipped, not treated as failures.
            if key not in data:
                print(f"⚠️ Skipping {key} (not found in prompt data)")
                continue

            convo = await run_milestone(data[key], f"{game}_{key}")
            result = extract_boolean_result(convo)
            result_obj["results"][result_key] = result

            print(f"{label}: {result}")

            # None (no Result line found) counts as a failure too.
            if result is not True:
                print(f"🛑 {label} failed. Stopping evaluation.")
                result_obj["failed_at"] = key
                return

        print("🎉 All milestones passed successfully!")

    except KeyboardInterrupt:
        print("\n⚠️ Evaluation interrupted by user.")

    finally:
        # Always persist whatever was collected, even on early return/interrupt.
        Path("results").mkdir(exist_ok=True)
        with open("results/result_ray2.json", "w") as f:
            json.dump(result_obj, f, indent=2)
        print("📝 Result saved to results/result_ray2.json")
async def eval_dakota():
    """Evaluate the 'Dakota Winchester's Adventures' milestones sequentially.

    Mirrors the other eval_* scripts: each milestone passes only when the
    conversation ends with 'Result: True'; evaluation stops at the first
    failure, and results are always written to results/result_dakota.json
    via the finally block (even on Ctrl-C).
    """
    game = "Dakota Winchester's Adventures"

    with open("milestone_prompts.json", "r") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    # failed_at / last_attempted let a partial run be diagnosed from the JSON.
    result_obj = {
        "game": "dakota winchester's adventures",
        "results": {},
        "failed_at": None,
        "last_attempted": None
    }

    print(f"🔍 Running evaluation for: {game}")

    # (prompt key in milestone_prompts.json, result key, display label)
    milestones = [
        ("milestone_prompt1", "stones_connected", "🪨 Stepping Stones Connected"),
        ("milestone_prompt2", "fire_burns", "🔥 Fire Burns"),
        ("milestone_prompt3", "temple_open", "🏛️ Temple Open"),
        ("milestone_prompt4", "monkey_banana", "🐒 Monkey Eats Banana"),
        ("milestone_prompt5", "lights_illuminate", "💡 Lights Illuminate"),
    ]

    try:
        for key, result_key, label in milestones:
            result_obj["last_attempted"] = key
            print(f"\n🏁 {key}")

            # Missing prompts are skipped, not treated as failures.
            if key not in data:
                print(f"⚠️ Skipping {key} (not found in prompt data)")
                continue

            convo = await run_milestone(data[key], f"{game}_{key}")
            result = extract_boolean_result(convo)
            result_obj["results"][result_key] = result

            print(f"{label}: {result}")

            # None (no Result line found) counts as a failure too.
            if result is not True:
                print(f"🛑 {label} failed. Stopping evaluation.")
                result_obj["failed_at"] = key
                return

        print("🎉 All milestones passed successfully!")

    except KeyboardInterrupt:
        print("\n⚠️ Evaluation interrupted by user.")

    finally:
        # Always persist whatever was collected, even on early return/interrupt.
        Path("results").mkdir(exist_ok=True)
        with open("results/result_dakota.json", "w") as f:
            json.dump(result_obj, f, indent=2)
        print("📝 Result saved to results/result_dakota.json")
async def eval_vortex():
    """Evaluate the 'Vortex Point1' milestones sequentially.

    Milestone 1 counts found places (must be exactly 8); milestones 2-3
    check boolean door states.  BUG FIX: the original only wrote
    results/result_vortex.json when every milestone passed — an early
    failure returned without saving anything, unlike the sibling eval_*
    scripts.  Saving now happens in a finally block, and a 'failed_at'
    field records where evaluation stopped.
    """
    game = "Vortex Point1"

    with open("milestone_prompts.json", "r") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    result_obj = {
        "game": "vortex_point1",
        "results": {},
        "failed_at": None,
    }

    print(f"🔍 Running evaluation for: {game}")

    try:
        # === milestone_prompt1 ===
        key = "milestone_prompt1"
        print(f"\n🏁 {key}")
        convo1 = await run_milestone(data[key], f"{game}_{key}")
        places = extract_found_places(convo1)

        result_obj["results"]["found_places"] = places
        print(f"📌 Found Places: {places}")

        if places != 8:
            print("🛑 Found Places is not exactly 8. Stopping evaluation.")
            result_obj["failed_at"] = key
            return

        # === milestone_prompt2 ===
        key = "milestone_prompt2"
        print(f"\n🏁 {key}")
        convo2 = await run_milestone(data[key], f"{game}_{key}")
        result2 = extract_boolean_result(convo2)

        result_obj["results"]["door_2956_open"] = result2
        print(f"🚪 2956 Vineyard Drive Door Open: {result2}")

        if result2 is not True:
            print("🛑 Door is not open. Stopping evaluation.")
            result_obj["failed_at"] = key
            return

        # === milestone_prompt3 ===
        key = "milestone_prompt3"
        print(f"\n🏁 {key}")
        convo3 = await run_milestone(data[key], f"{game}_{key}")
        result3 = extract_boolean_result(convo3)

        result_obj["results"]["wing_c_open"] = result3
        print(f"🚪 Wing C Door Open: {result3}")

    finally:
        # Always persist whatever was collected, even on early return.
        Path("results").mkdir(exist_ok=True)
        with open("results/result_vortex.json", "w") as f:
            json.dump(result_obj, f, indent=2)
        print("📝 Result saved to results/result_vortex.json")
Stopping evaluation.") 65 | return 66 | 67 | # === milestone_prompt3 === 68 | key = "milestone_prompt3" 69 | print(f"\n🏁 {key}") 70 | convo3 = await run_milestone(data[key], f"{game}_{key}") 71 | result3 = extract_boolean_result(convo3) 72 | 73 | result_obj["results"]["wing_c_open"] = result3 74 | print(f"🚪 Wing C Door Open: {result3}") 75 | 76 | Path("results").mkdir(exist_ok=True) 77 | with open("results/result_vortex.json", "w") as f: 78 | json.dump(result_obj, f, indent=2) 79 | print("📝 Result saved to results/result_vortex.json") 80 | 81 | if __name__ == "__main__": 82 | asyncio.run(eval_vortex()) -------------------------------------------------------------------------------- /game_agent/UI-Tars/desktop_env/controllers/python.py: -------------------------------------------------------------------------------- 1 | import pyautogui 2 | import time 3 | from mss import mss 4 | from PIL import Image 5 | from datetime import datetime 6 | import os 7 | 8 | 9 | class LocalController: 10 | def __init__(self, screenshot_dir="screenshots", game_name="None"): 11 | self.screenshot_dir = f"{screenshot_dir}/{game_name}" 12 | 13 | if not os.path.exists(self.screenshot_dir): 14 | os.makedirs(self.screenshot_dir) 15 | 16 | def get_screenshot(self) -> str: 17 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 18 | path = os.path.join(self.screenshot_dir, f"screenshot_{timestamp}.png") 19 | 20 | with mss() as sct: 21 | monitor = sct.monitors[1] 22 | sct_img = sct.grab(monitor) 23 | img = Image.frombytes("RGB", sct_img.size, sct_img.rgb) 24 | img.save(path) 25 | 26 | return path 27 | 28 | def execute_python_command(self, command: str): 29 | exec(command) 30 | 31 | def execute_action(self, action): 32 | if isinstance(action, str): 33 | if action == "WAIT": 34 | time.sleep(2) 35 | elif action in ["DONE", "FAIL"]: 36 | return 37 | else: 38 | exec(action) 39 | elif isinstance(action, dict): 40 | action_type = action.get("action_type") 41 | params = action.get("parameters", {}) 42 
def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
    """Run one task example against the local environment.

    Loops agent.predict -> env.step until the agent emits DONE/FAIL or
    max_steps is reached, saving a screenshot and a traj.jsonl record per
    action, then writes the (dummy) evaluation score to result.txt and
    appends it to `scores`.
    """
    runtime_logger = setup_logger(example, example_result_dir)
    agent.reset(runtime_logger)
    env.reset(task_config=example)
    time.sleep(3)  # Shorter wait for local environment
    obs = env._get_obs()
    done = False
    step_idx = 0

    while not done and step_idx < max_steps:
        # One predict call may yield several low-level actions.
        response, actions = agent.predict(instruction, obs)

        for action in actions:
            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
            logger.info("Step %d: %s", step_idx + 1, action)

            obs, reward, done, info = env.step(action, args.sleep_after_execution)

            logger.info("Reward: %.2f", reward)
            logger.info("Done: %s", done)

            # Save screenshot
            # NOTE(review): LocalDesktopEnv returns a file *path* for
            # obs['screenshot'], which has no .save(); this branch then only
            # logs a warning every step — confirm whether that is intended.
            screenshot_filename = f"step_{step_idx + 1}_{action_timestamp}.png"
            screenshot_path = os.path.join(example_result_dir, screenshot_filename)
            if hasattr(obs['screenshot'], "save"):
                obs['screenshot'].save(screenshot_path)
            else:
                logger.warning("Screenshot object has no .save() method — skipping save.")

            # Log trajectory (one JSON object per line, appended per action)
            traj_path = os.path.join(example_result_dir, "traj.jsonl")
            with open(traj_path, "a", encoding="utf-8") as f:
                f.write(json.dumps({
                    "step_num": step_idx + 1,
                    "action_timestamp": action_timestamp,
                    "action": action,
                    "reward": reward,
                    "done": done,
                    "info": info,
                    "screenshot_file": screenshot_filename
                }) + "\n")

            if done:
                logger.info("The episode is done.")
                break

        # step_idx counts predict() rounds, not individual actions.
        step_idx += 1

    # Dummy evaluation for local
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    scores.append(result)

    result_path = os.path.join(example_result_dir, "result.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write(f"{result}\n")
def setup_logger(example, example_result_dir):
    """Create (or fetch) the per-example logger writing to runtime.log.

    BUG FIX: logging.getLogger returns a cached logger per name, so calling
    this twice for the same example id used to attach a second FileHandler
    and duplicate every log line.  A handler for the same file is now added
    only once.
    """
    runtime_logger = logging.getLogger(f"desktopenv.example.{example['id']}")
    runtime_logger.setLevel(logging.DEBUG)
    log_file = os.path.join(example_result_dir, "runtime.log")

    # FileHandler stores the absolute path in .baseFilename.
    target = os.path.abspath(log_file)
    already_attached = any(
        isinstance(h, logging.FileHandler) and getattr(h, "baseFilename", None) == target
        for h in runtime_logger.handlers
    )
    if not already_attached:
        runtime_logger.addHandler(logging.FileHandler(log_file))
    return runtime_logger
def save_chat_log(entry, game_name, api_model, cua):
    """Append one game-action log entry to a JSON file.

    Path: json/{cua}/{api_model}/{game_name}/game_log_{game_name}.json
    (DOC FIX: the old docstring claimed json/{api_model}/{game_name}/{cua},
    which does not match the os.path.join order actually used.)

    A missing or corrupt log file is treated as an empty log rather than
    raising, so logging never aborts a run.
    """
    log_dir = os.path.join("json", cua, api_model, game_name)
    os.makedirs(log_dir, exist_ok=True)

    log_file = os.path.join(log_dir, f"game_log_{game_name}.json")

    logs = []
    if os.path.exists(log_file):
        with open(log_file, "r", encoding="utf-8") as f:
            try:
                logs = json.load(f)
            except json.JSONDecodeError:
                # Corrupt file: start a fresh log instead of crashing.
                logs = []

    logs.append(entry)
    with open(log_file, "w", encoding="utf-8") as f:
        json.dump(logs, f, ensure_ascii=False, indent=4)
def load_memory(json_dir, type="episodic", n=None):
    """
    Load an agent memory file from json_dir.

    type: one of 'episodic', 'clue', 'task', 'reflection'
    n: (Optional) If it's a list, only the last n items are returned
    Returns:
        - List (optionally sliced) for episodic/reflection memory
        - Dict for clue/task memory
        - The matching empty container when the file does not exist
    """
    filename = f"{type}_memory.json"
    path = os.path.join(json_dir, filename)

    if not os.path.exists(path):
        # BUG FIX: `type in ("task")` was a substring test against the plain
        # string "task" (true for "t", "as", ...) and never covered "clue",
        # contradicting the docstring.  Use a real tuple of the dict types.
        return {} if type in ("task", "clue") else []

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Slice the last n items (for list types only)
    if isinstance(data, list) and n is not None:
        return data[-n:]
    return data
async def eval_elevator():
    """Evaluate 'Elevator Room Escape' milestone by milestone.

    Each milestone result line carries a 'Final Stage: N' value and a
    'Continue: True/Final/False' flag.  'True' advances to the next
    milestone, 'Final' ends evaluation successfully, anything else stops it.
    """
    game = "Elevator Room Escape"

    # Load prompts
    with open("milestone_prompts.json", "r", encoding="utf-8") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    print(f"🎮 Starting evaluation for: {game}")

    # Run milestones 1..4 if present (works fine even if only 1..3 exist)
    for i in range(1, 4 + 1):
        prompt_key = f"milestone_prompt{i}"
        prompt = data.get(prompt_key)
        if not prompt:
            # Skip if this milestone is not defined
            continue

        print(f"\n🏁 {game} - {prompt_key}")
        conversation = await run_milestone(prompt, f"{game}_{prompt_key}")

        # Extract and print ONLY the Final Stage
        result_line = extract_result(conversation)
        final_stage = extract_final_stage(result_line or "")
        print(f"🎯 Final Stage: {final_stage if final_stage is not None else 'N/A'}")

        # Control flow based on Continue flag
        status = parse_continue_flag(result_line or "")
        if status == "continue":
            continue  # proceed to next milestone
        elif status == "final":
            print(f"🏁 Final milestone reached at {prompt_key}. Evaluation ends.")
            return
        else:
            # Covers 'Continue: False' and any missing/garbled result line.
            print(f"🛑 Evaluation stopped after {prompt_key}")
            return

    print("✅ All available milestones completed.")
# Matches a line like "Color: Sky Blue" anywhere inside a message.
COLOR_LINE_RE = re.compile(r"Color\s*:\s*([A-Za-z\- ]+)", re.IGNORECASE)

# Normalized (lowercase, hyphenated) spelling -> canonical display name.
CANONICAL_COLORS = {
    "red": "Red",
    "yellow": "Yellow",
    "green": "Green",
    "sky-blue": "Sky-Blue",
    "blue": "Blue",
    "pink": "Pink",
}

def normalize_color_name(raw: str) -> Tuple[str, Optional[str]]:
    """Return (trimmed raw text, canonical color name or None).

    Normalization: lowercase, collapse whitespace, replace spaces with
    hyphens, drop anything that is not a-z or '-', then look up the
    canonical spelling.
    """
    trimmed = (raw or "").strip()

    key = re.sub(r"\s+", " ", trimmed.lower().strip())
    key = key.replace(" ", "-")
    key = re.sub(r"[^a-z\-]", "", key)

    return trimmed, CANONICAL_COLORS.get(key)

def extract_color(conversation: List[str]) -> Optional[str]:
    """Scan the conversation newest-first for a recognizable color line."""
    for message in reversed(conversation or []):
        found = COLOR_LINE_RE.search(message)
        if not found:
            continue
        _, canonical = normalize_color_name(found.group(1))
        if canonical is not None:
            return canonical
    # No message contained a color we recognize.
    return None
async def eval_space():
    """Evaluate 'Space Museum Escape'.

    Runs the optional Instruction prompt, then milestone_prompt1, and
    extracts the reported color from the resulting conversation.  The
    canonical color (or a warning) is written to
    results/result_space_museum.json.
    """
    game = "Space Museum Escape"

    # milestone_prompts.json
    with open("milestone_prompts.json", "r", encoding="utf-8") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    print(f"🎮 Running evaluation for: {game}")

    # 1) Instruction — optional setup prompt; its conversation is discarded.
    if "Instruction" in data:
        _ = await run_milestone(data["Instruction"], f"{game}_Instruction")

    # 2) milestone_prompt1
    prompt_key = "milestone_prompt1"
    if prompt_key not in data:
        print(f"❌ No milestone data found for {game} / {prompt_key}")
        return

    conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}")
    if conversation is None:
        print("⚠️ No conversation returned from agent.")
        return

    # Canonical color name (e.g. "Sky-Blue") or None when unrecognized.
    color = extract_color(conversation)

    if color is not None:
        print(f"Color: {color}")

        result_obj = {
            "game": "space_museum_escape",
            "result": f"color: {color}",
            "color": color,
        }

        Path("results").mkdir(exist_ok=True)
        with open("results/result_space_museum.json", "w", encoding="utf-8") as f:
            json.dump(result_obj, f, indent=2, ensure_ascii=False)
        print("📝 Result saved to results/result_space_museum.json")
    else:
        # Note: no result file is written in this case.
        print("⚠️ Couldn't find Color in result.")
8 | """ 9 | with open(image_path, "rb") as image_file: 10 | return base64.b64encode(image_file.read()).decode("utf-8") 11 | 12 | 13 | def encode_images_to_base64(image_paths): 14 | encoded = [] 15 | for path in image_paths: 16 | try: 17 | with open(path, "rb") as f: 18 | encoded_str = base64.b64encode(f.read()).decode("utf-8") 19 | encoded.append(encoded_str) 20 | except Exception as e: 21 | print(f"[WARN] Failed to encode image {path}: {e}") 22 | return encoded 23 | 24 | def extract_python_code(content): 25 | if not content: 26 | print("[ERROR] extract_python_code() received empty content") 27 | return "", "" 28 | 29 | print(f"[DEBUG] Raw content received:\n{content}\n") 30 | 31 | # 🔹 Extract only the Python code that comes after the "code" key 32 | match = re.search(r'"code"\s*:\s*"""\s*(.*?)\s*"""', content, re.DOTALL) 33 | action_match = re.search(r'"action":\s*"([^"]+)"', content) # 🔹 Find action value 34 | 35 | action_text = action_match.group(1) if action_match else "" # 🔹 Extract string value only 36 | 37 | if match: 38 | code_content = match.group(1) # Get only the Python code section 39 | 40 | # 🔹 Remove comments (multiline """ """ and single-line # comments) 41 | code_content = re.sub(r'""".*?"""', '', code_content, flags=re.DOTALL).strip() 42 | code_content = re.sub(r'^\s*#.*$', '', code_content, flags=re.MULTILINE).strip() 43 | 44 | print(f"[DEBUG] Extracted Code:\n{code_content}\n") 45 | print(f"[DEBUG] Extracted Action:\n{action_text}\n") # 🔹 Print action value 46 | return code_content, action_text 47 | 48 | print("[ERROR] No Python code found in content.") 49 | return "", action_text # 🔹 Always return action_text as well 50 | 51 | 52 | def extract_action_change(content: str) -> dict: 53 | """ 54 | Extracts whether an action succeeded and the reason from a GPT response. 
def extract_action_change(content: str) -> dict:
    """
    Extracts whether an action succeeded and the reason from a GPT response.

    Args:
        content (str): GPT response

    Returns:
        dict: {"success": True/False,
               "explanation": "Explanation of the change caused by the action"}
    """
    if not content:
        print("⚠️ content is empty.")
        return {"success": False, "explanation": "No content"}

    print(f"[DEBUG] Raw GPT response:\n{content}\n")

    # "Success_Action: True/False" -> boolean; a missing marker counts as failure.
    status = re.search(r"Success_Action:\s*(True|False)", content, re.IGNORECASE)
    succeeded = bool(status) and status.group(1).lower() == "true"

    # Everything after "Reason:" (may span multiple lines thanks to DOTALL).
    reason = re.search(r"Reason:\s*(.+)", content, re.IGNORECASE | re.DOTALL)
    explanation = reason.group(1).strip() if reason else "No explanation of change"

    return {
        "success": succeeded,
        "explanation": explanation,
    }
class LocalDesktopComputer(Computer):
    """Computer implementation that drives the local desktop via pyautogui.

    Counts "countable" actions (click/scroll/type/...) so the caller can cap
    the number of model-issued actions per turn via ``max_actions``.
    """

    def __init__(self, max_actions: int = 3):
        # Map platform.system() onto the environment labels the API expects;
        # anything that is not mac/linux is reported as windows.
        os_name = platform.system().lower()
        if "darwin" in os_name:
            self._environment = "mac"
        elif "linux" in os_name:
            self._environment = "linux"
        else:
            self._environment = "windows"
        self._dimensions = pyautogui.size()

        self._action_count = 0
        self._max_actions = max_actions
        # Only these action names increment the counter; move/wait are free.
        self._countable = ["click", "double_click", "scroll", "type", "keypress", "drag"]

    @property
    def environment(self) -> Literal["windows", "mac", "linux"]:
        return self._environment

    @property
    def dimensions(self) -> tuple[int, int]:
        # Screen size in pixels, as reported by pyautogui at construction time.
        return self._dimensions

    @property
    def action_count(self) -> int:
        return self._action_count

    @property
    def max_actions(self) -> int:
        return self._max_actions

    def _maybe_count(self, action_name: str) -> None:
        # Increment the counter only for countable actions; the caller is
        # responsible for enforcing the max_actions limit.
        if action_name in self._countable:
            self._action_count += 1
            print(f"⬆️ 액션 카운터 증가: {self._action_count}/{self._max_actions}")

    def screenshot(self) -> str:
        """Return the current screen as a base64-encoded PNG string."""
        img = pyautogui.screenshot()
        buffer = BytesIO()
        img.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    def click(self, x: int, y: int, button: str = "left") -> None:
        self._maybe_count("click")
        pyautogui.click(x=x, y=y, button=button)

    def double_click(self, x: int, y: int) -> None:
        # NOTE(review): doubleClick is issued without coordinates, i.e. at the
        # current cursor position rather than (x, y) — confirm intended.
        self._maybe_count("double_click")
        pyautogui.doubleClick(x=x, y=y)

    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        # NOTE(review): horizontal scroll_x is ignored; only vertical
        # scrolling is performed after moving to (x, y).
        self._maybe_count("scroll")
        pyautogui.moveTo(x, y)
        pyautogui.scroll(scroll_y)

    def type(self, text: str) -> None:
        self._maybe_count("type")
        pyautogui.write(text)

    def wait(self, ms: int = 1000) -> None:
        # ms is milliseconds; time.sleep takes seconds.
        time.sleep(ms / 1000)

    def move(self, x: int, y: int) -> None:
        # Pure cursor move: intentionally not counted.
        pyautogui.moveTo(x, y)

    def keypress(self, keys: List[str]) -> None:
        """Press all keys down in order, then release in reverse (a chord)."""
        self._maybe_count("keypress")
        for key in keys:
            pyautogui.keyDown(key)
        for key in reversed(keys):
            pyautogui.keyUp(key)

    def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along a list of {'x':..,'y':..} points (button held throughout)."""
        self._maybe_count("drag")
        if not path:
            return
        pyautogui.moveTo(path[0]["x"], path[0]["y"])
        pyautogui.mouseDown()
        for point in path[1:]:
            pyautogui.moveTo(point["x"], point["y"])
        pyautogui.mouseUp()

    def get_current_url(self) -> str:
        # No browser in local-desktop mode; fixed placeholder URL.
        return "file://local-desktop"

    def reset_action_counter(self) -> None:
        self._action_count = 0
-------------------------------------------------------------------------------- /evaluator/eval_game/eval_sherlock2.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_found_items(conversation: List[str]) -> int: 10 | for msg in reversed(conversation): 11 | match = re.search(r"#Found Items:\s*(\d+)", msg, re.IGNORECASE) 12 | if match: 13 | return int(match.group(1)) 14 | return -1 15 | 16 | def extract_boolean_result(conversation: List[str]) -> bool | None: 17 | for msg in reversed(conversation): 18 | match = re.search(r"Result:\s*(True|False)", msg, re.IGNORECASE) 19 | if match: 20 | return match.group(1).lower() == "true" 21 | return None 22 | 23 | async def eval_sherlock2(): 24 | game = "Sherlock Holmes 2" 25 | 26 | with open("milestone_prompts.json", "r") as f: 27 | all_data = json.load(f) 28 | 29 | data = all_data.get(game, {}) 30 | if not data: 31 | print(f"❌ No data found for game: {game}") 32 | return 33 | 34 | result_obj = { 35 | "game": "sherlock_holmes_2", 36 | "results": {}, 37 | "failed_at": None 38 | } 39 | 40 | print(f"🔍 Running evaluation for: {game}") 41 | 42 | try: 43 | # === milestone_prompt1 (Always run, even if it fails) === 44 | key = "milestone_prompt1" 45 | print(f"\n🏁 {key}") 46 | convo1 = await run_milestone(data[key], f"{game}_{key}") 47 | found_items = extract_found_items(convo1) 48 | 49 | result_obj["results"]["found_items"] = found_items 50 | print(f"🧾 Found Items: {found_items}") 51 | 52 | # === milestone_prompt2 (fails => stop) === 53 | key = "milestone_prompt2" 54 | print(f"\n🏁 {key}") 55 | convo2 = await run_milestone(data[key], f"{game}_{key}") 56 | result2 = extract_boolean_result(convo2) 57 | 58 | result_obj["results"]["fire_alarm_open"] = result2 59 | print(f"🚨 Fire Alarm Open (2F): {result2}") 60 | 61 | if result2 
def save_results(result_obj):
    """Persist the evaluation summary to results/result_sherlock2.json."""
    out_dir = Path("results")
    out_dir.mkdir(exist_ok=True)
    with open("results/result_sherlock2.json", "w") as handle:
        json.dump(result_obj, handle, indent=2)
    print("📝 Result saved to results/result_sherlock2.json")
["click", "double_click", "scroll", "type", "keypress", "drag"] 24 | 25 | @property 26 | def environment(self) -> Literal["windows", "mac", "linux"]: 27 | return self._environment 28 | 29 | @property 30 | def dimensions(self) -> tuple[int, int]: 31 | return self._dimensions 32 | 33 | @property 34 | def action_count(self) -> int: 35 | return self._action_count 36 | 37 | @property 38 | def max_actions(self) -> int: 39 | return self._max_actions 40 | 41 | def _maybe_count(self, action_name: str): 42 | if action_name in self._countable: 43 | self._action_count += 1 44 | print(f"⬆️ 액션 카운터 증가: {self._action_count}/{self._max_actions}") 45 | 46 | def screenshot(self) -> str: 47 | img = pyautogui.screenshot() 48 | buffer = BytesIO() 49 | img.save(buffer, format="PNG") 50 | return base64.b64encode(buffer.getvalue()).decode("utf-8") 51 | 52 | def click(self, x: int, y: int, button: str = "left") -> None: 53 | self._maybe_count("click") 54 | pyautogui.click(x=x, y=y, button=button) 55 | 56 | def double_click(self, x: int, y: int) -> None: 57 | self._maybe_count("double_click") 58 | pyautogui.doubleClick(x=x, y=y) 59 | 60 | def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: 61 | self._maybe_count("scroll") 62 | pyautogui.moveTo(x, y) 63 | pyautogui.scroll(scroll_y) 64 | 65 | def type(self, text: str) -> None: 66 | self._maybe_count("type") 67 | pyautogui.write(text) 68 | 69 | def wait(self, ms: int = 1000) -> None: 70 | time.sleep(ms / 1000) 71 | 72 | def move(self, x: int, y: int) -> None: 73 | pyautogui.moveTo(x, y) 74 | 75 | def keypress(self, keys: List[str]) -> None: 76 | self._maybe_count("keypress") 77 | for key in keys: 78 | pyautogui.keyDown(key) 79 | for key in reversed(keys): 80 | pyautogui.keyUp(key) 81 | 82 | def drag(self, path: List[Dict[str, int]]) -> None: 83 | self._maybe_count("drag") 84 | if not path: 85 | return 86 | pyautogui.moveTo(path[0]["x"], path[0]["y"]) 87 | pyautogui.mouseDown() 88 | for point in path[1:]: 89 | 
def __init__(self, max_actions: int = 3):
    """Detect the host OS, record the screen size, and set up action budgeting."""
    system = platform.system().lower()
    if "darwin" in system:
        env = "mac"
    elif "linux" in system:
        env = "linux"
    else:
        env = "windows"
    self._environment = env
    self._dimensions = pyautogui.size()

    self._action_count = 0
    self._max_actions = max_actions
    # Only these action types consume the action budget.
    self._countable = ["click", "double_click", "scroll", "type", "keypress", "drag"]
= "left") -> None: 53 | self._maybe_count("click") 54 | pyautogui.click(x=x, y=y, button=button) 55 | 56 | def double_click(self, x: int, y: int) -> None: 57 | self._maybe_count("double_click") 58 | pyautogui.doubleClick(x=x, y=y) 59 | 60 | def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: 61 | self._maybe_count("scroll") 62 | pyautogui.moveTo(x, y) 63 | pyautogui.scroll(scroll_y) 64 | 65 | def type(self, text: str) -> None: 66 | self._maybe_count("type") 67 | pyautogui.write(text) 68 | 69 | def wait(self, ms: int = 1000) -> None: 70 | time.sleep(ms / 1000) 71 | 72 | def move(self, x: int, y: int) -> None: 73 | pyautogui.moveTo(x, y) 74 | 75 | def keypress(self, keys: List[str]) -> None: 76 | self._maybe_count("keypress") 77 | for key in keys: 78 | pyautogui.keyDown(key) 79 | for key in reversed(keys): 80 | pyautogui.keyUp(key) 81 | 82 | def drag(self, path: List[Dict[str, int]]) -> None: 83 | self._maybe_count("drag") 84 | if not path: 85 | return 86 | pyautogui.moveTo(path[0]["x"], path[0]["y"]) 87 | pyautogui.mouseDown() 88 | for point in path[1:]: 89 | pyautogui.moveTo(point["x"], point["y"]) 90 | pyautogui.mouseUp() 91 | 92 | def get_current_url(self) -> str: 93 | return "file://local-desktop" 94 | 95 | def reset_action_counter(self): 96 | self._action_count = 0 97 | -------------------------------------------------------------------------------- /game_agent/cradle/gui_grounding/computer/computer_use.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import time 3 | import base64 4 | from typing import List, Dict, Literal 5 | from io import BytesIO 6 | from PIL import Image, ImageDraw, ImageFont 7 | import pyautogui 8 | from .computer import Computer 9 | 10 | class LocalDesktopComputer(Computer): 11 | def __init__(self, max_actions: int = 3): 12 | os_name = platform.system().lower() 13 | if "darwin" in os_name: 14 | self._environment = "mac" 15 | elif "linux" in os_name: 16 | 
self._environment = "linux" 17 | else: 18 | self._environment = "windows" 19 | self._dimensions = pyautogui.size() 20 | 21 | self._action_count = 0 22 | self._max_actions = max_actions 23 | self._countable = ["click", "double_click", "scroll", "type", "keypress", "drag"] 24 | 25 | @property 26 | def environment(self) -> Literal["windows", "mac", "linux"]: 27 | return self._environment 28 | 29 | @property 30 | def dimensions(self) -> tuple[int, int]: 31 | return self._dimensions 32 | 33 | @property 34 | def action_count(self) -> int: 35 | return self._action_count 36 | 37 | @property 38 | def max_actions(self) -> int: 39 | return self._max_actions 40 | 41 | def _maybe_count(self, action_name: str): 42 | if action_name in self._countable: 43 | self._action_count += 1 44 | print(f"⬆️ Add Action Counter: {self._action_count}/{self._max_actions}") 45 | 46 | def screenshot(self) -> str: 47 | img = pyautogui.screenshot() 48 | buffer = BytesIO() 49 | img.save(buffer, format="PNG") 50 | return base64.b64encode(buffer.getvalue()).decode("utf-8") 51 | 52 | def click(self, x: int, y: int, button: str = "left") -> None: 53 | self._maybe_count("click") 54 | pyautogui.click(x=x, y=y, button=button) 55 | 56 | def double_click(self, x: int, y: int) -> None: 57 | self._maybe_count("double_click") 58 | pyautogui.doubleClick(x=x, y=y) 59 | 60 | def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: 61 | self._maybe_count("scroll") 62 | pyautogui.moveTo(x, y) 63 | pyautogui.scroll(scroll_y) 64 | 65 | def type(self, text: str) -> None: 66 | self._maybe_count("type") 67 | pyautogui.write(text) 68 | 69 | def wait(self, ms: int = 1000) -> None: 70 | time.sleep(ms / 1000) 71 | 72 | def move(self, x: int, y: int) -> None: 73 | pyautogui.moveTo(x, y) 74 | 75 | def keypress(self, keys: List[str]) -> None: 76 | self._maybe_count("keypress") 77 | for key in keys: 78 | pyautogui.keyDown(key) 79 | for key in reversed(keys): 80 | pyautogui.keyUp(key) 81 | 82 | def drag(self, path: 
def acknowledge_safety_check_callback(message: str) -> bool:
    """Ask the operator to approve a pending safety check; True only on 'y'."""
    answer = input(f"Safety Check Warning: {message}\nProceed? (y/n): ")
    return answer.strip().lower() == "y"
def main_gpt_operator(user_prompt=None, max_retries=300):
    """Drive the CUA loop against the local desktop; returns the action count.

    Retries the model call up to max_retries times per turn when no
    'output' field comes back, then raises.
    """
    computer = LocalDesktopComputer(max_actions=300)
    tools = [{
        "type": "computer-preview",
        "display_width": computer.dimensions[0],
        "display_height": computer.dimensions[1],
        "environment": computer.environment,
    }]

    # Seed the conversation from the argument, or interactively.
    prompt = user_prompt if user_prompt else input("> ")
    items = [{"role": "user", "content": prompt}]

    while True:
        response = None
        for attempt in range(max_retries):
            candidate = create_response(
                model="computer-use-preview",
                input=items,
                tools=tools,
                truncation="auto",
            )
            if "output" in candidate:
                response = candidate
                break
            print(f"[Retry {attempt+1}/{max_retries}] No output from model. Retrying...")
            time.sleep(1)
        if response is None:
            # Exhausted every retry without a usable response.
            raise ValueError("No output from model after multiple retries")

        items += response["output"]

        for item in response["output"]:
            items += handle_item(item, computer)

        if items[-1].get("role") == "assistant":
            break

    return computer.action_count
(y/n): ").lower() 7 | return response.strip() == "y" 8 | 9 | def handle_item(item, computer: LocalDesktopComputer): 10 | if item["type"] == "message": 11 | print(item["content"][0]["text"]) 12 | 13 | if item["type"] == "computer_call": 14 | action = item["action"] 15 | action_type = action["type"] 16 | action_args = {k: v for k, v in action.items() if k != "type"} 17 | print(f"🖱️ {action_type}({action_args})") 18 | 19 | getattr(computer, action_type)(**action_args) 20 | 21 | print(f"🎯 액션 카운트: {computer.action_count}/{computer.max_actions}") 22 | 23 | screenshot_base64 = computer.screenshot() 24 | 25 | checks = item.get("pending_safety_checks", []) 26 | for check in checks: 27 | if not acknowledge_safety_check_callback(check["message"]): 28 | raise ValueError(f"Safety check failed: {check['message']}") 29 | 30 | output = { 31 | "type": "computer_call_output", 32 | "call_id": item["call_id"], 33 | "acknowledged_safety_checks": checks, 34 | "output": { 35 | "type": "input_image", 36 | "image_url": f"data:image/png;base64,{screenshot_base64}", 37 | }, 38 | } 39 | 40 | if computer.environment == "browser": 41 | current_url = computer.get_current_url() 42 | output["output"]["current_url"] = current_url 43 | check_blocklisted_url(current_url) 44 | 45 | return [output] 46 | 47 | return [] 48 | 49 | import time 50 | 51 | def main_gpt_cua(prompt_text=None, max_retries=300): 52 | computer = LocalDesktopComputer(max_actions=300) 53 | tools = [{ 54 | "type": "computer-preview", 55 | "display_width": computer.dimensions[0], 56 | "display_height": computer.dimensions[1], 57 | "environment": computer.environment, 58 | }] 59 | 60 | items = [] 61 | if prompt_text: 62 | items.append({"role": "user", "content": prompt_text}) 63 | else: 64 | user_input = input("> ") 65 | items.append({"role": "user", "content": user_input}) 66 | 67 | while True: 68 | for attempt in range(max_retries): 69 | response = create_response( 70 | model="computer-use-preview", 71 | input=items, 72 | 
def openai_completion(system_prompt, model_name, base64_images, prompt):
    """Call an OpenAI chat model with an optional list of PNG images and a text prompt.

    Returns the assistant message text.
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Build the user content: images (if any) first, then the text prompt.
    user_content = []
    if base64_images:
        for base64_image in base64_images:
            user_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{base64_image}"},
            })
    user_content.append({"type": "text", "text": prompt})

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0,
        max_tokens=1024,
    )
    return response.choices[0].message.content
def gemini_completion(system_prompt, model_name, base64_images, prompt):
    """Call a Gemini model with an optional list of PNG images and a text prompt.

    Returns the generated text, or None if generation fails.
    """
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
    # Fix: the Gemini API has no "system" role inside the contents list;
    # {"role": "system", "text": ...} made generate_content raise, and the
    # except clause silently returned None. The system prompt belongs in
    # system_instruction on the model itself.
    model = genai.GenerativeModel(
        model_name=model_name,
        system_instruction=system_prompt,
    )

    # Content parts: images (if any) first, then the text prompt.
    parts = []
    if base64_images:
        for base64_image in base64_images:
            parts.append({
                "mime_type": "image/png",
                "data": base64_image,
            })
    parts.append(prompt)

    try:
        response = model.generate_content(parts)
        return response.text
    except Exception as e:
        # Best-effort by design: log and signal failure with None.
        print(f"Error: {e}")
        return None
def gemini_completion(system_prompt, model_name, base64_images, prompt):
    """Call a Gemini model with an optional list of PNG images and a text prompt.

    Returns the generated text, or None if generation fails.
    """
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
    # Fix: the Gemini API has no "system" role inside the contents list;
    # {"role": "system", "text": ...} made generate_content raise, and the
    # except clause silently returned None. The system prompt belongs in
    # system_instruction on the model itself.
    model = genai.GenerativeModel(
        model_name=model_name,
        system_instruction=system_prompt,
    )

    # Content parts: images (if any) first, then the text prompt.
    parts = []
    if base64_images:
        for base64_image in base64_images:
            parts.append({
                "mime_type": "image/png",
                "data": base64_image,
            })
    parts.append(prompt)

    try:
        response = model.generate_content(parts)
        return response.text
    except Exception as e:
        # Best-effort by design: log and signal failure with None.
        print(f"Error: {e}")
        return None
(y/n): " 9 | ).lower() 10 | return response.strip() == "y" 11 | 12 | 13 | def handle_item(item, computer: Computer): 14 | """Handle each item; may cause a computer action + screenshot.""" 15 | if item["type"] == "message": # print messages 16 | print(item["content"][0]["text"]) 17 | 18 | if item["type"] == "computer_call": # perform computer actions 19 | action = item["action"] 20 | action_type = action["type"] 21 | action_args = {k: v for k, v in action.items() if k != "type"} 22 | print(f"{action_type}({action_args})") 23 | 24 | # give our computer environment action to perform 25 | getattr(computer, action_type)(**action_args) 26 | 27 | screenshot_base64 = computer.screenshot() 28 | 29 | pending_checks = item.get("pending_safety_checks", []) 30 | for check in pending_checks: 31 | if not acknowledge_safety_check_callback(check["message"]): 32 | raise ValueError(f"Safety check failed: {check['message']}") 33 | 34 | # return value informs model of the latest screenshot 35 | call_output = { 36 | "type": "computer_call_output", 37 | "call_id": item["call_id"], 38 | "acknowledged_safety_checks": pending_checks, 39 | "output": { 40 | "type": "input_image", 41 | "image_url": f"data:image/png;base64,{screenshot_base64}", 42 | }, 43 | } 44 | 45 | # additional URL safety checks for browser environments 46 | if computer.environment == "browser": 47 | current_url = computer.get_current_url() 48 | call_output["output"]["current_url"] = current_url 49 | check_blocklisted_url(current_url) 50 | 51 | return [call_output] 52 | 53 | return [] 54 | 55 | 56 | def main(): 57 | """Run the CUA (Computer Use Assistant) loop, using Local Playwright.""" 58 | with LocalPlaywrightComputer() as computer: 59 | tools = [ 60 | { 61 | "type": "computer-preview", 62 | "display_width": computer.dimensions[0], 63 | "display_height": computer.dimensions[1], 64 | "environment": computer.environment, 65 | } 66 | ] 67 | 68 | items = [] 69 | while True: # get user input forever 70 | user_input = input("> 
") 71 | items.append({"role": "user", "content": user_input}) 72 | 73 | while True: # keep looping until we get a final response 74 | response = create_response( 75 | model="computer-use-preview", 76 | input=items, 77 | tools=tools, 78 | truncation="auto", 79 | ) 80 | 81 | if "output" not in response: 82 | print(response) 83 | raise ValueError("No output from model") 84 | 85 | items += response["output"] 86 | 87 | for item in response["output"]: 88 | items += handle_item(item, computer) 89 | 90 | if items[-1].get("role") == "assistant": 91 | break 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_gamecafe.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | from eval_game.eval_utils import run_milestone 4 | 5 | def parse_continue_flag(msg: str) -> str: 6 | if "Continue: True" in msg: 7 | return "continue" 8 | if "Continue: Final" in msg: 9 | return "final" 10 | return "stop" 11 | 12 | def extract_result(conversation: list[str]) -> str | None: 13 | for msg in reversed(conversation): 14 | if "Result:" in msg: 15 | return msg 16 | return None 17 | 18 | async def eval_gamecafe(): 19 | game = "Game Cafe Escape" 20 | 21 | # Prompt Load 22 | with open("milestone_prompts.json", "r") as f: 23 | all_data = json.load(f) 24 | 25 | data = all_data.get(game, {}) 26 | if not data: 27 | print(f"❌ No data found for game: {game}") 28 | return 29 | 30 | print(f"🎮 Starting evaluation for: {game}") 31 | 32 | # === milestone_prompt1 === 33 | prompt_key = "milestone_prompt1" 34 | if prompt_key in data: 35 | print(f"\n🏁 {game} - {prompt_key}") 36 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 37 | result = extract_result(conversation) 38 | print("🔍 Result:", result) 39 | 40 | status = parse_continue_flag(result or "") 41 | if status != "continue": 42 | print("🛑 Evaluation 
stopped after milestone 1") 43 | return 44 | 45 | # === milestone_prompt2 === 46 | prompt_key = "milestone_prompt2" 47 | if prompt_key in data: 48 | print(f"\n🏁 {game} - {prompt_key}") 49 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 50 | result = extract_result(conversation) 51 | print("🔍 Result:", result) 52 | 53 | status = parse_continue_flag(result or "") 54 | if status != "continue": 55 | print("🛑 Evaluation stopped after milestone 2") 56 | return 57 | 58 | # === milestone_prompt3 === 59 | prompt_key = "milestone_prompt3" 60 | if prompt_key in data: 61 | print(f"\n🏁 {game} - {prompt_key}") 62 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 63 | result = extract_result(conversation) 64 | print("🔍 Result:", result) 65 | 66 | status = parse_continue_flag(result or "") 67 | if status != "continue": 68 | print("🛑 Evaluation stopped after milestone 3") 69 | return 70 | 71 | # === milestone_prompt4 === 72 | prompt_key = "milestone_prompt4" 73 | if prompt_key in data: 74 | print(f"\n🏁 {game} - {prompt_key}") 75 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 76 | result = extract_result(conversation) 77 | print("🔍 Result:", result) 78 | 79 | status = parse_continue_flag(result or "") 80 | if status != "continue": 81 | print("🛑 Evaluation stopped after milestone 4") 82 | return 83 | 84 | # === milestone_prompt5 === 85 | prompt_key = "milestone_prompt5" 86 | if prompt_key in data: 87 | print(f"\n🏁 {game} - {prompt_key}") 88 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 89 | result = extract_result(conversation) 90 | print("🔍 Result:", result) 91 | 92 | status = parse_continue_flag(result or "") 93 | if status != "continue": 94 | print("🏁 Final milestone reached or evaluation ends.") 95 | return 96 | 97 | print("✅ All available milestones completed.") 98 | 99 | if __name__ == "__main__": 100 | asyncio.run(eval_gamecafe()) 
--------------------------------------------------------------------------------