├── game_agent ├── UI-Tars │ ├── mm_agents │ │ ├── __init__.py │ │ ├── gui_som │ │ │ ├── __init__.py │ │ │ ├── data_preparation │ │ │ │ ├── __init__.py │ │ │ │ └── majestic_million_download.py │ │ │ └── READAME.md │ │ ├── accessibility_tree_wrap │ │ │ ├── __init__.py │ │ │ └── relevant_retrieve.py │ │ └── README.md │ ├── desktop_env │ │ ├── __init__.py │ │ ├── controllers │ │ │ ├── __init__.py │ │ │ └── python.py │ │ └── desktop_env.py │ ├── run.bash │ ├── .env │ ├── tools │ │ ├── __init__.py │ │ ├── screenshot.py │ │ └── load_data.py │ ├── README.md │ └── lib_run_single.py ├── cradle │ ├── gpt_cua │ │ ├── agent │ │ │ └── __init__.py │ │ ├── computers │ │ │ ├── __init__.py │ │ │ ├── computer.py │ │ │ └── computer_use.py │ │ ├── __init__.py │ │ ├── run.bash │ │ ├── .env │ │ ├── log │ │ │ └── sherlock_holmes_the_tea_shop_m_You_are_an_AI_agent_specializi_20250325_202931.txt │ │ ├── utils.py │ │ └── simple_cua_loop.py │ ├── .env │ ├── claude_cua │ │ ├── __init__.py │ │ └── tools │ │ │ ├── collection.py │ │ │ ├── groups.py │ │ │ ├── __init__.py │ │ │ └── base.py │ ├── api │ │ ├── __init__.py │ │ ├── serving │ │ │ └── __init__.py │ │ └── api_caller.py │ ├── agent │ │ └── cradle │ │ │ ├── __init__.py │ │ │ ├── game_end.py │ │ │ └── info_gathering.py │ ├── run.bash │ ├── gui_grounding │ │ ├── __init__.py │ │ └── computer │ │ │ ├── computer.py │ │ │ └── computer_use.py │ ├── tools │ │ ├── __init__.py │ │ ├── screenshot.py │ │ ├── load_data.py │ │ └── utils.py │ ├── main.py │ └── README.md ├── gpt_operator │ ├── agent │ │ └── __init__.py │ ├── .env │ ├── computers │ │ ├── __init__.py │ │ └── computer.py │ ├── run.bash │ ├── README.md │ ├── utils.py │ └── simple_cua_loop.py ├── coast │ ├── gui_agent │ │ ├── gpt_cua │ │ │ ├── agent │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ ├── computers │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ └── computer_use.py │ │ │ ├── utils.py │ │ │ └── simple_cua_loop.py │ │ ├── claude_cua │ │ │ ├── __init__.py │ │ │ └── tools │ 
│ │ │ ├── collection.py │ │ │ │ ├── groups.py │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ ├── __init__.py │ │ ├── gui_grounding │ │ │ ├── __init__.py │ │ │ └── computer │ │ │ │ ├── computer.py │ │ │ │ └── computer_use.py │ │ └── execute.py │ ├── .env │ ├── node │ │ └── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── serving │ │ │ ├── __init__.py │ │ │ └── api_providers.py │ │ └── api_caller.py │ ├── run.bash │ ├── agent │ │ └── __init__.py │ ├── tools │ │ ├── __init__.py │ │ ├── screenshot.py │ │ └── load_data.py │ └── config.yaml ├── .env └── claude_computer_use │ ├── .env │ ├── run.bash │ ├── tools │ ├── collection.py │ ├── groups.py │ ├── __init__.py │ └── base.py │ └── README.md ├── evaluator ├── judge │ ├── vlm │ │ ├── tools │ │ │ ├── serving │ │ │ │ ├── __init__.py │ │ │ │ ├── test.py │ │ │ │ └── api_providers.py │ │ │ └── utils.py │ │ ├── __init__.py │ │ ├── api_caller.py │ │ ├── load_data.py │ │ └── screenshot.py │ └── computer_use │ │ ├── __init__.py │ │ └── tools │ │ ├── collection.py │ │ ├── groups.py │ │ ├── __init__.py │ │ └── base.py ├── run.bash ├── .env ├── screenshots │ ├── crimson_room │ │ └── milestone.png │ ├── camping_room_escape │ │ └── milestone.png │ ├── chemical_room_escape │ │ └── milestone.png │ ├── machine_room_escape │ │ ├── milestone1.png │ │ ├── milestone2.png │ │ └── milestone3.png │ ├── space_museum_escape │ │ └── milestone.png │ ├── vending_machine_room │ │ └── milestone.png │ ├── wood_workshop_escape │ │ └── milestone.png │ ├── computer_office_escape │ │ └── milestone.png │ └── geometric_room_escape │ │ └── milestone.png ├── eval_game │ ├── eval_chemical.py │ ├── eval_sort.py │ ├── eval_crimson.py │ ├── eval_camping.py │ ├── eval_wood.py │ ├── eval_computer.py │ ├── eval_geometric.py │ ├── eval_vending.py │ ├── eval_utils.py │ ├── eval_idol.py │ ├── eval_pico.py │ ├── eval_grim1.py │ ├── eval_kingdom.py │ ├── eval_college.py │ ├── eval_festival.py │ ├── eval_smalltown.py │ ├── eval_grim2.py │ ├── eval_nickbounty.py │ ├── 
eval_sherlock.py │ ├── eval_videostudio.py │ ├── eval_paint.py │ ├── eval_vortex2.py │ ├── eval_saucy.py │ ├── eval_design.py │ ├── eval_vortex3.py │ ├── eval_mirror.py │ ├── eval_ray2.py │ ├── eval_pierre.py │ ├── eval_dakota.py │ ├── eval_vortex.py │ ├── eval_elevator.py │ ├── eval_space.py │ ├── eval_sherlock2.py │ └── eval_gamecafe.py └── evaluate_game.py ├── assets ├── fig_coast.png ├── fig_obgap.png └── fig_keyidea.png ├── .gitignore └── LICENSE /game_agent/UI-Tars/mm_agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluator/judge/vlm/tools/serving/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game_agent/UI-Tars/desktop_env/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /game_agent/UI-Tars/mm_agents/gui_som/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluator/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python evaluate_game.py -------------------------------------------------------------------------------- /game_agent/UI-Tars/desktop_env/controllers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game_agent/UI-Tars/mm_agents/accessibility_tree_wrap/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/game_agent/UI-Tars/mm_agents/gui_som/data_preparation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game_agent/UI-Tars/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run_uitars.py -------------------------------------------------------------------------------- /game_agent/UI-Tars/mm_agents/accessibility_tree_wrap/relevant_retrieve.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /game_agent/cradle/gpt_cua/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import Agent 2 | -------------------------------------------------------------------------------- /game_agent/gpt_operator/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import Agent 2 | -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gpt_cua/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import Agent 2 | -------------------------------------------------------------------------------- /evaluator/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/UI-Tars/.env: 
-------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/coast/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/cradle/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/cradle/gpt_cua/computers/__init__.py: -------------------------------------------------------------------------------- 1 | from .computer_use import LocalDesktopComputer 2 | -------------------------------------------------------------------------------- /assets/fig_coast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/assets/fig_coast.png -------------------------------------------------------------------------------- /assets/fig_obgap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/assets/fig_obgap.png -------------------------------------------------------------------------------- /game_agent/gpt_operator/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /assets/fig_keyidea.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/assets/fig_keyidea.png -------------------------------------------------------------------------------- /game_agent/claude_computer_use/.env: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY="your_anthropic_key" 2 | OPENAI_API_KEY="your_openai_key" -------------------------------------------------------------------------------- /game_agent/coast/node/__init__.py: -------------------------------------------------------------------------------- 1 | from .node_inference import Planner 2 | 3 | __all__ = [ 4 | Planner 5 | ] -------------------------------------------------------------------------------- /game_agent/cradle/claude_cua/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import run_agent 2 | 3 | __all__ = [ 4 | "run_agent" 5 | ] -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/claude_cua/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import main 2 | 3 | 4 | 5 | __all__ = [ 6 | "main" 7 | ] 8 | -------------------------------------------------------------------------------- /game_agent/coast/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_caller import ( 2 | api_caller 3 | ) 4 | 5 | __all__ = [ 6 | "api_caller" 7 | ] -------------------------------------------------------------------------------- /game_agent/cradle/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_caller import ( 2 | api_caller 3 | ) 4 | 5 | __all__ = [ 6 | "api_caller" 7 | ] -------------------------------------------------------------------------------- /game_agent/gpt_operator/computers/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .computer import Computer 2 | from .computer_use import LocalDesktopComputer 3 | -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .execute import execute_action 2 | 3 | 4 | 5 | __all__ = [ 6 | execute_action 7 | ] 8 | -------------------------------------------------------------------------------- /game_agent/cradle/gpt_cua/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_cua_loop import( 2 | main_gpt_cua 3 | ) 4 | 5 | __all__ = [ 6 | "main_gpt_cua" 7 | ] -------------------------------------------------------------------------------- /game_agent/cradle/agent/cradle/__init__.py: -------------------------------------------------------------------------------- 1 | from .game_agent import( 2 | run_game_agent 3 | ) 4 | 5 | __all__ = [ 6 | "run_game_agent" 7 | ] -------------------------------------------------------------------------------- /evaluator/screenshots/crimson_room/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/crimson_room/milestone.png -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gpt_cua/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_cua_loop import( 2 | main_gpt_operator 3 | ) 4 | 5 | __all__ = [ 6 | "main_gpt_operator" 7 | ] -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gpt_cua/computers/__init__.py: -------------------------------------------------------------------------------- 1 | from .computer_use import 
LocalDesktopComputer 2 | 3 | 4 | __all__ = [ 5 | "LocalDesktopComputer" 6 | ] -------------------------------------------------------------------------------- /evaluator/screenshots/camping_room_escape/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/camping_room_escape/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/chemical_room_escape/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/chemical_room_escape/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/machine_room_escape/milestone1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/machine_room_escape/milestone1.png -------------------------------------------------------------------------------- /evaluator/screenshots/machine_room_escape/milestone2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/machine_room_escape/milestone2.png -------------------------------------------------------------------------------- /evaluator/screenshots/machine_room_escape/milestone3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/machine_room_escape/milestone3.png -------------------------------------------------------------------------------- /evaluator/screenshots/space_museum_escape/milestone.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/space_museum_escape/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/vending_machine_room/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/vending_machine_room/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/wood_workshop_escape/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/wood_workshop_escape/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/computer_office_escape/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/computer_office_escape/milestone.png -------------------------------------------------------------------------------- /evaluator/screenshots/geometric_room_escape/milestone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahnjaewoo/FlashAdventure/HEAD/evaluator/screenshots/geometric_room_escape/milestone.png -------------------------------------------------------------------------------- /game_agent/coast/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | SCRIPT_NAME="game_agent.py" 5 | CONFIG_FILE="config.yaml" 6 | 7 | 8 | python "$SCRIPT_NAME" --config "$CONFIG_FILE"; then 9 | -------------------------------------------------------------------------------- 
/game_agent/UI-Tars/mm_agents/gui_som/READAME.md: -------------------------------------------------------------------------------- 1 | Deprecated since we found we can use `accelaerator` to do the same thing. But can be potentially used in the future when only access to screen is available. -------------------------------------------------------------------------------- /game_agent/coast/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import Agent 2 | 3 | from .moduler import ( 4 | SeekerBot, 5 | SolverBot, 6 | MapperBot 7 | ) 8 | 9 | __all__ = [ 10 | Agent, 11 | SeekerBot, 12 | SolverBot, 13 | MapperBot 14 | ] 15 | -------------------------------------------------------------------------------- /game_agent/coast/api/serving/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_providers import ( 2 | openai_completion, 3 | anthropic_completion, 4 | gemini_completion 5 | ) 6 | 7 | __all__ = [ 8 | "openai_completion", 9 | "anthropic_completion", 10 | "gemini_completion" 11 | ] -------------------------------------------------------------------------------- /game_agent/cradle/api/serving/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_providers import ( 2 | openai_completion, 3 | anthropic_completion, 4 | gemini_completion 5 | ) 6 | 7 | __all__ = [ 8 | "openai_completion", 9 | "anthropic_completion", 10 | "gemini_completion" 11 | ] -------------------------------------------------------------------------------- /game_agent/cradle/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MODEL="claude-3-7-sonnet-20250219" 4 | PROVIDER="anthropic" 5 | CUA="claude" 6 | 7 | ## other_options 8 | # MODEL="gpt-4o" 9 | # PROVIDER="openai" 10 | # CUA="gpt" 11 | # CUA="sonnet" 12 | # CUA="uground" 13 | 14 | ## Execute 15 | python main.py 
--model "$MODEL" --provider "$PROVIDER" --cua "$CUA" -------------------------------------------------------------------------------- /game_agent/cradle/gui_grounding/__init__.py: -------------------------------------------------------------------------------- 1 | from .computer.computer_use import( 2 | LocalDesktopComputer 3 | ) 4 | 5 | from .uground import( 6 | agent_step 7 | ) 8 | from .claude import( 9 | run_claude_gui_agent 10 | ) 11 | 12 | 13 | __all__ = [ 14 | "LocalDesktopComputer", 15 | "agent_step", 16 | "run_claude_gui_agent" 17 | ] -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gui_grounding/__init__.py: -------------------------------------------------------------------------------- 1 | from .computer.computer_use import( 2 | LocalDesktopComputer 3 | ) 4 | 5 | from .uground import( 6 | agent_step 7 | ) 8 | from .claude import( 9 | run_claude_gui_agent 10 | ) 11 | 12 | 13 | __all__ = [ 14 | "LocalDesktopComputer", 15 | "agent_step", 16 | "run_claude_gui_agent" 17 | ] -------------------------------------------------------------------------------- /game_agent/claude_computer_use/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=$(pwd)/.. 4 | 5 | # Choose task by number (1-based index from tasks.json) 6 | 7 | TASK_FILE="./json/game_prompt.json" 8 | PROMPT_TYPE="prompt" 9 | MAX_ACTIONS=1000 10 | 11 | python main.py \ 12 | --task-file "$TASK_FILE" \ 13 | --prompt-type "$PROMPT_TYPE" \ 14 | --max-actions "$MAX_ACTIONS" -------------------------------------------------------------------------------- /game_agent/cradle/gpt_cua/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHON=python 4 | MAIN_SCRIPT="main.py" 5 | DEFAULT_PROMPT_FILE="tasks.json" 6 | 7 | echo "🚀 Agent 자동화 루프 시작 중..." 
8 | 9 | # prompt.json 파일이 존재하면 자동으로 전달 10 | if [ -f "$DEFAULT_PROMPT_FILE" ]; then 11 | echo "📝 $DEFAULT_PROMPT_FILE 감지됨, 자동으로 프롬프트 사용" 12 | $PYTHON $MAIN_SCRIPT "$DEFAULT_PROMPT_FILE" 13 | else 14 | echo "📄 $DEFAULT_PROMPT_FILE 없음, 기본 프롬프트로 실행" 15 | $PYTHON $MAIN_SCRIPT 16 | fi -------------------------------------------------------------------------------- /game_agent/cradle/gpt_cua/.env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY="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" 2 | # ANTHROPIC_API_KEY="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" 3 | # GEMINI_API_KEY="sk-zzzzzzzzzzzzzzzzzzzzzz" 4 | 5 | 6 | # OPENAI_API_KEY="sk-svcacct-GGftbt0z90DZrqmTZ6O8M17wtdQtTHDuxTQIdNec1diEUQSc-DPmSwDi6Zmx4S3EYIYT3BlbkFJhzY7qIGmeQJVIL2GMldI8mFYtwg8vWKOEvoOgjL9SKWXZbZhjsNW3N0u3db-CQJX4JwA" -------------------------------------------------------------------------------- /evaluator/judge/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cross-platform computer interaction tools for Anthropic AI. 3 | This package provides tools for AI to interact with computers running Windows, macOS, or Linux. 
4 | """ 5 | 6 | __version__ = "0.1.0" 7 | __author__ = "FlashAdventure" 8 | 9 | """ 10 | Autonomous Computer Use Agent with Claude Sonnet 11 | """ 12 | 13 | from .evaluator import ( 14 | main, 15 | ) 16 | 17 | from .load_data import( 18 | load_game_prompt_eval, 19 | ) 20 | 21 | 22 | 23 | __all__ = [ 24 | "main", 25 | "load_game_prompt_eval", 26 | 27 | ] 28 | -------------------------------------------------------------------------------- /game_agent/gpt_operator/run.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHON=python 4 | MAIN_SCRIPT="main.py" 5 | DEFAULT_PROMPT_FILE="game_prompts.json" 6 | HISTORY=10 7 | 8 | echo "🚀 Starting agent automation loop..." 9 | 10 | # Automatically pass the prompt.json file if it exists 11 | if [ -f "$DEFAULT_PROMPT_FILE" ]; then 12 | echo "📝 $DEFAULT_PROMPT_FILE detected, using prompt automatically" 13 | $PYTHON $MAIN_SCRIPT "$DEFAULT_PROMPT_FILE" --history $HISTORY 14 | else 15 | echo "📄 $DEFAULT_PROMPT_FILE not found, running with default prompt" 16 | $PYTHON $MAIN_SCRIPT --history $HISTORY 17 | fi -------------------------------------------------------------------------------- /game_agent/cradle/agent/cradle/game_end.py: -------------------------------------------------------------------------------- 1 | from api import api_caller 2 | 3 | def game_end(system_prompt, screen, api_provider, model_name): 4 | prompt = f""" 5 | Current Screen:\n 6 | [Image]\n\n 7 | 8 | Please check whether the current screen indicates that the game has been completely and successfully cleared.\n 9 | If successful, the player either escapes the room or sees a message indicating the game has been completed.\n 10 | If it has, output [Done].\n 11 | """ 12 | 13 | response = api_caller(api_provider, system_prompt, model_name, prompt, screen) 14 | 15 | return response -------------------------------------------------------------------------------- /game_agent/UI-Tars/tools/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .load_data import ( 2 | save_chat_log, 3 | load_game_prompt, 4 | load_system_prompt 5 | ) 6 | 7 | from .screenshot import ( 8 | capture_flash_screenshot 9 | ) 10 | 11 | from .utils import ( 12 | encode_images_to_base64, 13 | encode_image, 14 | extract_python_code, 15 | extract_action_change 16 | ) 17 | 18 | __all__ = [ 19 | "save_chat_log", 20 | "load_game_prompt", 21 | "load_game_prompt_eval", 22 | "capture_flash_screenshot", 23 | "encode_image", 24 | "extract_python_code", 25 | "extract_action_change", 26 | "encode_images_to_base64", 27 | "load_system_prompt" 28 | ] -------------------------------------------------------------------------------- /game_agent/cradle/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .load_data import ( 2 | save_chat_log, 3 | load_game_prompt, 4 | load_system_prompt 5 | ) 6 | 7 | from .screenshot import ( 8 | capture_flash_screenshot 9 | ) 10 | 11 | from .utils import ( 12 | encode_images_to_base64, 13 | encode_image, 14 | extract_python_code, 15 | extract_action_change 16 | ) 17 | 18 | __all__ = [ 19 | "save_chat_log", 20 | "load_game_prompt", 21 | "load_game_prompt_eval", 22 | "capture_flash_screenshot", 23 | "encode_image", 24 | "extract_python_code", 25 | "extract_action_change", 26 | "encode_images_to_base64", 27 | "load_system_prompt" 28 | ] -------------------------------------------------------------------------------- /evaluator/eval_game/eval_chemical.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | ## Configuration Dictionary 6 | config = { 7 | "game": "Chemical Room Escape", 8 | "api_provider": "anthropic", 9 | "model_name": "claude-3-7-sonnet-20250219", 10 | "loop_interval": 3 11 | } 12 | 13 | 14 | system_prompt, 
evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 15 | 16 | 17 | config["system_prompt"] = system_prompt 18 | config["evaluation_prompt"] = evaluation_prompt 19 | config["example_image_path"] = example_image_path 20 | 21 | ## Running 22 | evaluator_none_cua(**config) 23 | 24 | 25 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_sort.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | ## Configuration Dictionary 6 | 7 | config = { 8 | "game": "Sort the Court", 9 | "api_provider": "anthropic", 10 | "model_name": "claude-3-7-sonnet-20250219", 11 | "loop_interval": 3 12 | } 13 | 14 | 15 | 16 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 17 | 18 | config["system_prompt"] = system_prompt 19 | config["evaluation_prompt"] = evaluation_prompt 20 | config["example_image_path"] = example_image_path 21 | 22 | 23 | ## Running 24 | evaluator_none_cua(**config) 25 | 26 | 27 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_crimson.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | ## Configuration Dictionary 6 | 7 | 8 | config = { 9 | "game": "Crimson Room", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 16 | 17 | 18 | config["system_prompt"] = system_prompt 19 | config["evaluation_prompt"] = evaluation_prompt 20 | config["example_image_path"] = example_image_path 21 | 22 | 23 | ## Running 24 | evaluator_none_cua(**config) 25 | 26 | 
27 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_camping.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | 6 | ## Configuration Dictionary 7 | 8 | config = { 9 | "game": "Camping Room Escape", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 16 | 17 | 18 | config["system_prompt"] = system_prompt 19 | config["evaluation_prompt"] = evaluation_prompt 20 | config["example_image_path"] = example_image_path 21 | 22 | 23 | ## Running 24 | evaluator_none_cua(**config) 25 | 26 | 27 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_wood.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | 6 | ## Configuration Dictionary 7 | 8 | config = { 9 | "game": "Wood Workshop Escape", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 16 | 17 | 18 | config["system_prompt"] = system_prompt 19 | config["evaluation_prompt"] = evaluation_prompt 20 | config["example_image_path"] = example_image_path 21 | 22 | 23 | ## Running 24 | evaluator_none_cua(**config) 25 | 26 | 27 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_computer.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import 
load_game_prompt_eval 3 | 4 | 5 | 6 | ## Configuration Dictionary 7 | 8 | config = { 9 | "game": "Computer Office Escape", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 16 | 17 | 18 | config["system_prompt"] = system_prompt 19 | config["evaluation_prompt"] = evaluation_prompt 20 | config["example_image_path"] = example_image_path 21 | 22 | 23 | ## Running 24 | evaluator_none_cua(**config) 25 | 26 | 27 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_geometric.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | 6 | ## Configuration Dictionary 7 | 8 | config = { 9 | "game": "Geometric Room Escape", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | 16 | system_prompt, evaluation_prompt, example_image_path = load_game_prompt_eval(config["game"]) 17 | 18 | 19 | config["system_prompt"] = system_prompt 20 | config["evaluation_prompt"] = evaluation_prompt 21 | config["example_image_path"] = example_image_path 22 | 23 | 24 | ## Running 25 | evaluator_none_cua(**config) 26 | 27 | 28 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_vending.py: -------------------------------------------------------------------------------- 1 | from judge.vlm import main as evaluator_none_cua 2 | from judge.vlm import load_game_prompt_eval 3 | 4 | 5 | 6 | ## Configuration Dictionary 7 | 8 | config = { 9 | "game": "Vending Machine Room Escape", 10 | "api_provider": "anthropic", 11 | "model_name": "claude-3-7-sonnet-20250219", 12 | "loop_interval": 3 13 | } 14 | 15 | 16 | system_prompt, 
import json
import sys
import re
import asyncio
from pathlib import Path
from judge.computer_use.main import main as run_main

# Scratch file used to hand a single-task spec to the computer-use runner.
TEMP_TASK_FILE = "temp_task.json"


async def run_milestone(prompt_text: str, task_name: str) -> list[str]:
    """Run the computer-use agent on a single milestone prompt.

    Writes a one-task JSON file, points the runner's CLI at it via
    ``sys.argv``, awaits the run, and returns the resulting conversation.

    Args:
        prompt_text: The milestone prompt to evaluate.
        task_name: Key under which the task is stored in the temp file.

    Returns:
        The conversation produced by ``run_main()``.
    """
    task_data = {
        task_name: {
            "milestone_prompt": prompt_text,
        }
    }
    with open(TEMP_TASK_FILE, "w") as f:
        json.dump(task_data, f)

    # Fix: the original overwrote sys.argv permanently, leaking fake CLI
    # state into the rest of the process. Save and restore it around the run.
    saved_argv = sys.argv
    sys.argv = [
        "main.py",
        "--task-file", TEMP_TASK_FILE,
        "--task-name", task_name,
        "--prompt-type", "milestone_prompt",
    ]
    try:
        return await run_main()  # returns conversation
    finally:
        sys.argv = saved_argv
Once counted, output the result in the following format: ### Output Format #### New Suspect: [Number of occurrences]Do not perform any further interactions after counting. Your task ends once the count is provided. 2 | 3 | ASSISTANT: New Suspect: 1 4 | 5 | USER: 6 | 7 | -------------------------------------------------------------------------------- /evaluator/judge/vlm/api_caller.py: -------------------------------------------------------------------------------- 1 | from judge.vlm.tools.serving.api_providers import anthropic_completion, openai_completion, gemini_completion 2 | 3 | def api_caller(api_provider, system_prompt, model_name, move_prompts, base64_image=None, base64_image2=None): 4 | base64_images = [img for img in [base64_image, base64_image2] if img] 5 | 6 | if api_provider == "anthropic": 7 | response = anthropic_completion(system_prompt, model_name, base64_images, move_prompts) 8 | elif api_provider == "openai": 9 | response = openai_completion(system_prompt, model_name, base64_images, move_prompts) 10 | elif api_provider == "gemini": 11 | response = gemini_completion(system_prompt, model_name, base64_images, move_prompts) 12 | else: 13 | raise NotImplementedError(f"API provider '{api_provider}' is not supported.") 14 | 15 | return response 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 💾 Cache and Temporary Files 2 | cache/* 3 | __pycache__/ 4 | */__pycache__/ 5 | *.pyc 6 | *.pyo 7 | *.pyd 8 | *.mp4 9 | *.sh 10 | 11 | # 🔧 Compiled Binaries and Object Files 12 | *.o 13 | *.so 14 | *.dll 15 | *.dylib 16 | *.exe 17 | *.out 18 | *.a 19 | *.lib 20 | *.obj 21 | 22 | # 📄 Logs and Debug Files 23 | *.log 24 | *.tmp 25 | *.swp 26 | *.swo 27 | 28 | 29 | # 📌 Build and Dependency Directories 30 | bin/ 31 | build/ 32 | dist/ 33 | *.egg-info/ 34 | *.manifest 35 | *.spec 36 | 37 | # 🏗️ Make and CMake Files 38 | 
from typing import Protocol, List, Literal, Dict


class Computer(Protocol):
    """Defines the 'shape' (methods/properties) our loop expects."""

    @property
    def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ...

    @property
    def dimensions(self) -> tuple[int, int]: ...
    # Screen size as (width, height) in pixels.

    def screenshot(self) -> str: ...
    # Returns the screenshot as a string (presumably base64 -- confirm with implementations).

    def click(self, x: int, y: int, button: str = "left") -> None: ...

    def double_click(self, x: int, y: int) -> None: ...

    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ...

    def type(self, text: str) -> None: ...

    def wait(self, ms: int = 1000) -> None: ...

    def move(self, x: int, y: int) -> None: ...

    def keypress(self, keys: List[str]) -> None: ...

    def drag(self, path: List[Dict[str, int]]) -> None: ...

    # Fix: this was declared without `self`, so the protocol required a
    # method taking zero arguments when invoked on an instance -- no
    # conforming implementation could satisfy it.
    def get_current_url(self) -> str: ...
from typing import Protocol, List, Literal, Dict


class Computer(Protocol):
    """Defines the 'shape' (methods/properties) our loop expects."""

    @property
    def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ...

    @property
    def dimensions(self) -> tuple[int, int]: ...
    # Screen size as (width, height) in pixels.

    def screenshot(self) -> str: ...
    # Returns the screenshot as a string (presumably base64 -- confirm with implementations).

    def click(self, x: int, y: int, button: str = "left") -> None: ...

    def double_click(self, x: int, y: int) -> None: ...

    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ...

    def type(self, text: str) -> None: ...

    def wait(self, ms: int = 1000) -> None: ...

    def move(self, x: int, y: int) -> None: ...

    def keypress(self, keys: List[str]) -> None: ...

    def drag(self, path: List[Dict[str, int]]) -> None: ...

    # Fix: this was declared without `self`, so the protocol required a
    # method taking zero arguments when invoked on an instance -- no
    # conforming implementation could satisfy it.
    def get_current_url(self) -> str: ...
4 | """ 5 | 6 | __version__ = "0.1.0" 7 | __author__ = "FlashAdventure" 8 | 9 | """ 10 | Autonomous Computer Use Agent with Claude Sonnet 11 | """ 12 | 13 | from .app import ( 14 | get_screen_details, 15 | load_api_key, 16 | load_tasks, 17 | get_task_prompt, 18 | api_response_callback, 19 | tool_output_callback, 20 | message_callback, 21 | run_agent, 22 | main, 23 | ) 24 | 25 | # loop 모듈 26 | from .loop import ( 27 | APIProvider, 28 | sampling_loop, 29 | ) 30 | 31 | __all__ = [ 32 | "get_screen_details", 33 | "load_api_key", 34 | "load_tasks", 35 | "get_task_prompt", 36 | "api_response_callback", 37 | "tool_output_callback", 38 | "message_callback", 39 | "run_agent", 40 | "main", 41 | "APIProvider", 42 | "sampling_loop", 43 | ] 44 | -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gpt_cua/computers/computer.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol, List, Literal, Dict 2 | 3 | 4 | class Computer(Protocol): 5 | """Defines the 'shape' (methods/properties) our loop expects.""" 6 | 7 | @property 8 | def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ... 9 | @property 10 | def dimensions(self) -> tuple[int, int]: ... 11 | 12 | def screenshot(self) -> str: ... 13 | 14 | def click(self, x: int, y: int, button: str = "left") -> None: ... 15 | 16 | def double_click(self, x: int, y: int) -> None: ... 17 | 18 | def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ... 19 | 20 | def type(self, text: str) -> None: ... 21 | 22 | def wait(self, ms: int = 1000) -> None: ... 23 | 24 | def move(self, x: int, y: int) -> None: ... 25 | 26 | def keypress(self, keys: List[str]) -> None: ... 27 | 28 | def drag(self, path: List[Dict[str, int]]) -> None: ... 29 | 30 | def get_current_url() -> str: ... 
import os

import anthropic
import dotenv


def main():
    """One-off smoke test of the Anthropic computer-use beta endpoint."""
    # Load environment variables from .env (expects ANTHROPIC_API_KEY).
    dotenv.load_dotenv()

    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

    response = client.beta.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=1024,
        tools=[
            {
                "type": "computer_20250124",
                "name": "computer",
                "display_width_px": 1024,
                "display_height_px": 768,
                "display_number": 1,
            },
            {
                "type": "text_editor_20241022",
                "name": "str_replace_editor"
            },
            {
                "type": "bash_20241022",
                "name": "bash"
            }
        ],
        messages=[{"role": "user", "content": "Save a picture of a cat to my desktop."}],
        betas=["computer-use-2025-01-24"],
        thinking={"type": "enabled", "budget_tokens": 1024}
    )
    print(response)


if __name__ == "__main__":
    # Fixes: `import anthropic` appeared twice, and the (billed) API request
    # ran at import time; the guard makes importing this module side-effect
    # free apart from loading .env.
    main()
"""Collection classes for managing multiple tools."""

from typing import Any

from anthropic.types.beta import BetaToolUnionParam

from .base import (
    BaseAnthropicTool,
    ToolError,
    ToolFailure,
    ToolResult,
)


class ToolCollection:
    """A collection of anthropic-defined tools, dispatchable by name."""

    def __init__(self, *tools: BaseAnthropicTool):
        self.tools = tools
        # Map each tool's declared API name to its instance for O(1) dispatch.
        name_to_tool = {}
        for tool in tools:
            name_to_tool[tool.to_params()["name"]] = tool
        self.tool_map = name_to_tool

    def to_params(
        self,
    ) -> list[BetaToolUnionParam]:
        """Serialize every tool to its Anthropic API parameter form."""
        params = []
        for tool in self.tools:
            params.append(tool.to_params())
        return params

    async def run(self, *, name: str, tool_input: dict[str, Any]) -> ToolResult:
        """Invoke the tool registered under *name* with *tool_input*.

        Unknown names and ToolError exceptions are converted into
        ToolFailure results rather than raised.
        """
        tool = self.tool_map.get(name)
        if not tool:
            return ToolFailure(error=f"Tool {name} is invalid")
        try:
            return await tool(**tool_input)
        except ToolError as e:
            return ToolFailure(error=e.message)
from .load_data import (
    save_chat_log,
    load_game_prompt,
    load_config,
    load_memory,
    load_action_prompt
)

from .screenshot import (
    capture_flash_screenshot
)

from .utils import (
    encode_images_to_base64,
    encode_image,
    extract_python_code,
    extract_action_change,
    append_to_json_list,
    extract_clues_from_text,
    extract_episodic_memory_from_text,
    extract_json_block_from_response
)

# Fix: __all__ previously advertised names that are never imported here
# (load_game_prompt_eval, load_system_prompt, extract_json_from_messages),
# which makes `from <pkg> import *` raise AttributeError. Keep this list
# in sync with the imports above.
__all__ = [
    "save_chat_log",
    "load_game_prompt",
    "load_config",
    "load_memory",
    "load_action_prompt",
    "capture_flash_screenshot",
    "encode_images_to_base64",
    "encode_image",
    "extract_python_code",
    "extract_action_change",
    "append_to_json_list",
    "extract_clues_from_text",
    "extract_episodic_memory_from_text",
    "extract_json_block_from_response",
]
26 | grounding_width: 1366 27 | grounding_height: 768 28 | 29 | # Other options (optional) 30 | timeout: 30 # API timeout in seconds 31 | max_steps: 100 # Maximum number of execution steps 32 | 33 | 34 | ### Clue_Solver Max Action: 35 | max_actions_solver: 5 36 | ### Clue_Seeker Max Action: 37 | max_actions_seeker: 15 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Jaewoo Ahn, Junseo Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
import json
import os


def save_chat_log(entry, LOG_FILE):
    """Append a game move entry to a JSON log file.

    The log is stored as a JSON list; a missing or corrupt file is
    treated as an empty log rather than raising.
    """
    logs = []
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, "r") as f:
            try:
                logs = json.load(f)
            except json.JSONDecodeError:
                logs = []

    logs.append(entry)
    with open(LOG_FILE, "w") as f:
        json.dump(logs, f, indent=4)


def load_game_prompt_eval(game_name, image_num=1, json_path="./milestone_prompts.json"):
    """Load prompt and evaluation data for a specific game from JSON.

    Fixes: the docstring used to sit *after* the first statement, making it
    a no-op string literal rather than a docstring, and the prompts file
    path was a hard-coded local -- it is now a parameter with the same
    default, so existing callers are unaffected.

    Args:
        game_name: Key of the game inside the prompts JSON.
        image_num: Which numbered example image path to fetch.
        json_path: Path to the milestone prompts JSON file.

    Returns:
        Tuple of (prompt, evaluation_prompt, example_image_path); any of
        them may be None if absent from the JSON entry.

    Raises:
        ValueError: If the game is not present in the JSON file.
    """
    with open(json_path, "r") as f:
        game_data = json.load(f)

    try:
        entry = game_data[game_name]
    except KeyError:
        raise ValueError(f"No prompt found for game '{game_name}'.") from None

    prompt = entry.get("prompt")
    eval_prompt = entry.get("evaluation_prompt")
    example_image_path = entry.get(f"example_image_path{image_num}", None)

    return prompt, eval_prompt, example_image_path
from dataclasses import dataclass
from typing import Literal

from .base import BaseAnthropicTool
from .bash import ShellTool20241022, ShellTool20250124
from .computer import ComputerTool20241022, ComputerTool20250124
from .edit import CrossPlatformEditTool20241022, CrossPlatformEditTool20250124

# Supported tool-group versions and the beta flags that enable them.
ToolVersion = Literal["computer_use_20250124", "computer_use_20241022"]
BetaFlag = Literal["computer-use-2024-10-22", "computer-use-2025-01-24"]


@dataclass(frozen=True, kw_only=True)
class ToolGroup:
    """An immutable bundle of tools released under one API version."""

    version: ToolVersion
    tools: list[type[BaseAnthropicTool]]
    beta_flag: BetaFlag | None = None


# Named groups first, then the registry, so each release reads as a unit.
_GROUP_20241022 = ToolGroup(
    version="computer_use_20241022",
    tools=[ComputerTool20241022, CrossPlatformEditTool20241022, ShellTool20241022],
    beta_flag="computer-use-2024-10-22",
)

_GROUP_20250124 = ToolGroup(
    version="computer_use_20250124",
    tools=[ComputerTool20250124, CrossPlatformEditTool20250124, ShellTool20250124],
    beta_flag="computer-use-2025-01-24",
)

TOOL_GROUPS: list[ToolGroup] = [_GROUP_20241022, _GROUP_20250124]

TOOL_GROUPS_BY_VERSION = {group.version: group for group in TOOL_GROUPS}
@dataclass(frozen=True, kw_only=True) 14 | class ToolGroup: 15 | version: ToolVersion 16 | tools: list[type[BaseAnthropicTool]] 17 | beta_flag: BetaFlag | None = None 18 | 19 | 20 | TOOL_GROUPS: list[ToolGroup] = [ 21 | ToolGroup( 22 | version="computer_use_20241022", 23 | tools=[ComputerTool20241022, CrossPlatformEditTool20241022, ShellTool20241022], 24 | beta_flag="computer-use-2024-10-22", 25 | ), 26 | ToolGroup( 27 | version="computer_use_20250124", 28 | tools=[ComputerTool20250124, CrossPlatformEditTool20250124, ShellTool20250124], 29 | beta_flag="computer-use-2025-01-24", 30 | ), 31 | ] 32 | 33 | TOOL_GROUPS_BY_VERSION = {tool_group.version: tool_group for tool_group in TOOL_GROUPS} -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/claude_cua/tools/groups.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | from .base import BaseAnthropicTool 5 | from .bash import ShellTool20241022, ShellTool20250124 6 | from .computer import ComputerTool20241022, ComputerTool20250124 7 | from .edit import CrossPlatformEditTool20241022, CrossPlatformEditTool20250124 8 | 9 | ToolVersion = Literal["computer_use_20250124", "computer_use_20241022"] 10 | BetaFlag = Literal["computer-use-2024-10-22", "computer-use-2025-01-24"] 11 | 12 | 13 | @dataclass(frozen=True, kw_only=True) 14 | class ToolGroup: 15 | version: ToolVersion 16 | tools: list[type[BaseAnthropicTool]] 17 | beta_flag: BetaFlag | None = None 18 | 19 | 20 | TOOL_GROUPS: list[ToolGroup] = [ 21 | ToolGroup( 22 | version="computer_use_20241022", 23 | tools=[ComputerTool20241022, CrossPlatformEditTool20241022, ShellTool20241022], 24 | beta_flag="computer-use-2024-10-22", 25 | ), 26 | ToolGroup( 27 | version="computer_use_20250124", 28 | tools=[ComputerTool20250124, CrossPlatformEditTool20250124, ShellTool20250124], 29 | beta_flag="computer-use-2025-01-24", 
from abc import ABC, abstractmethod
from typing import List, Dict, Literal


class Computer(ABC):
    """Abstract base class describing a controllable desktop environment."""

    @property
    @abstractmethod
    def environment(self) -> Literal["windows", "mac", "linux"]:
        """Host OS this backend drives."""

    @property
    @abstractmethod
    def dimensions(self) -> tuple[int, int]:
        """Screen size as (width, height) in pixels."""

    @abstractmethod
    def screenshot(self) -> str:
        """Capture the screen as a string (presumably base64 -- confirm with implementations)."""

    @abstractmethod
    def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at (x, y) with the given mouse button."""

    @abstractmethod
    def double_click(self, x: int, y: int) -> None:
        """Double-click at (x, y)."""

    @abstractmethod
    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll by (scroll_x, scroll_y) with the pointer at (x, y)."""

    @abstractmethod
    def type(self, text: str) -> None:
        """Type the given text."""

    @abstractmethod
    def wait(self, ms: int = 1000) -> None:
        """Pause for the given number of milliseconds."""

    @abstractmethod
    def move(self, x: int, y: int) -> None:
        """Move the pointer to (x, y)."""

    @abstractmethod
    def keypress(self, keys: List[str]) -> None:
        """Press the given keys."""

    @abstractmethod
    def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag the pointer along the given path of points."""

    @abstractmethod
    def get_current_url(self) -> str:
        """Return the current URL -- NOTE(review): confirm semantics for non-browser backends."""
def execute_action(action_prompt, system_prompt=None, encoded_image=None, gui_model="gpt_operator", reasoning_model="gpt-4o", type=None):
    """Dispatch an action prompt to the configured GUI backend.

    Supported backends: gpt_operator, claude_cua, uground, claude_sonnet.
    Async backends are driven via asyncio.run. An unknown gui_model is
    reported and yields 0.
    """
    if gui_model == "gpt_operator":
        return main_gpt_operator(user_prompt=action_prompt)

    if gui_model == "claude_cua":
        coro = main_claude_cua(
            user_prompt=action_prompt,
            system_prompt=system_prompt,
            type=type,
        )
        return asyncio.run(coro)

    if gui_model == "uground":
        provider = "openai" if reasoning_model == "gpt-4o" else "anthropic"
        coro = main_uground(
            user_prompt=action_prompt,
            encoded_image=encoded_image,
            provider=provider,
            model=reasoning_model,
        )
        return asyncio.run(coro)

    if gui_model == "claude_sonnet":
        coro = main_claude_sonnet(
            user_prompt=action_prompt,
            encoded_image=encoded_image,
        )
        return asyncio.run(coro)

    print(f"[ERROR] Unknown gui_model: {gui_model}")
    return 0
41 | "EditTool20241022", 42 | ] -------------------------------------------------------------------------------- /game_agent/claude_computer_use/tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cross-platform computer interaction tools for Anthropic AI. 3 | This package provides tools for AI to interact with computers running Windows, macOS, or Linux. 4 | """ 5 | 6 | from .base import BaseAnthropicTool, ToolError, ToolFailure, ToolResult 7 | from .collection import ToolCollection 8 | from .bash import ShellTool20241022, ShellTool20250124 9 | from .computer import ComputerTool20241022, ComputerTool20250124 10 | from .edit import CrossPlatformEditTool20241022, CrossPlatformEditTool20250124 11 | from .groups import TOOL_GROUPS, TOOL_GROUPS_BY_VERSION, ToolVersion, BetaFlag 12 | 13 | __all__ = [ 14 | "BaseAnthropicTool", 15 | "ToolResult", 16 | "ToolError", 17 | "ToolFailure", 18 | "ToolCollection", 19 | "ShellTool20241022", 20 | "ShellTool20250124", 21 | "ComputerTool20241022", 22 | "ComputerTool20250124", 23 | "CrossPlatformEditTool20241022", 24 | "CrossPlatformEditTool20250124", 25 | "TOOL_GROUPS", 26 | "TOOL_GROUPS_BY_VERSION", 27 | "ToolVersion", 28 | "BetaFlag", 29 | ] 30 | 31 | # Aliases for backwards compatibility 32 | from .bash import ShellTool20250124 as BashTool20250124 33 | from .bash import ShellTool20241022 as BashTool20241022 34 | from .edit import CrossPlatformEditTool20250124 as EditTool20250124 35 | from .edit import CrossPlatformEditTool20241022 as EditTool20241022 36 | 37 | __all__ += [ 38 | "BashTool20250124", 39 | "BashTool20241022", 40 | "EditTool20250124", 41 | "EditTool20241022", 42 | ] -------------------------------------------------------------------------------- /game_agent/cradle/claude_cua/tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cross-platform computer interaction tools for Anthropic AI. 
3 | This package provides tools for AI to interact with computers running Windows, macOS, or Linux. 4 | """ 5 | 6 | from .base import BaseAnthropicTool, ToolError, ToolFailure, ToolResult 7 | from .collection import ToolCollection 8 | from .bash import ShellTool20241022, ShellTool20250124 9 | from .computer import ComputerTool20241022, ComputerTool20250124 10 | from .edit import CrossPlatformEditTool20241022, CrossPlatformEditTool20250124 11 | from .groups import TOOL_GROUPS, TOOL_GROUPS_BY_VERSION, ToolVersion, BetaFlag 12 | 13 | __all__ = [ 14 | "BaseAnthropicTool", 15 | "ToolResult", 16 | "ToolError", 17 | "ToolFailure", 18 | "ToolCollection", 19 | "ShellTool20241022", 20 | "ShellTool20250124", 21 | "ComputerTool20241022", 22 | "ComputerTool20250124", 23 | "CrossPlatformEditTool20241022", 24 | "CrossPlatformEditTool20250124", 25 | "TOOL_GROUPS", 26 | "TOOL_GROUPS_BY_VERSION", 27 | "ToolVersion", 28 | "BetaFlag", 29 | ] 30 | 31 | # Aliases for backwards compatibility 32 | from .bash import ShellTool20250124 as BashTool20250124 33 | from .bash import ShellTool20241022 as BashTool20241022 34 | from .edit import CrossPlatformEditTool20250124 as EditTool20250124 35 | from .edit import CrossPlatformEditTool20241022 as EditTool20241022 36 | 37 | __all__ += [ 38 | "BashTool20250124", 39 | "BashTool20241022", 40 | "EditTool20250124", 41 | "EditTool20241022", 42 | ] -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/claude_cua/tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cross-platform computer interaction tools for Anthropic AI. 3 | This package provides tools for AI to interact with computers running Windows, macOS, or Linux. 
4 | """ 5 | 6 | from .base import BaseAnthropicTool, ToolError, ToolFailure, ToolResult 7 | from .collection import ToolCollection 8 | from .bash import ShellTool20241022, ShellTool20250124 9 | from .computer import ComputerTool20241022, ComputerTool20250124 10 | from .edit import CrossPlatformEditTool20241022, CrossPlatformEditTool20250124 11 | from .groups import TOOL_GROUPS, TOOL_GROUPS_BY_VERSION, ToolVersion, BetaFlag 12 | 13 | __all__ = [ 14 | "BaseAnthropicTool", 15 | "ToolResult", 16 | "ToolError", 17 | "ToolFailure", 18 | "ToolCollection", 19 | "ShellTool20241022", 20 | "ShellTool20250124", 21 | "ComputerTool20241022", 22 | "ComputerTool20250124", 23 | "CrossPlatformEditTool20241022", 24 | "CrossPlatformEditTool20250124", 25 | "TOOL_GROUPS", 26 | "TOOL_GROUPS_BY_VERSION", 27 | "ToolVersion", 28 | "BetaFlag", 29 | ] 30 | 31 | # Aliases for backwards compatibility 32 | from .bash import ShellTool20250124 as BashTool20250124 33 | from .bash import ShellTool20241022 as BashTool20241022 34 | from .edit import CrossPlatformEditTool20250124 as EditTool20250124 35 | from .edit import CrossPlatformEditTool20241022 as EditTool20241022 36 | 37 | __all__ += [ 38 | "BashTool20250124", 39 | "BashTool20241022", 40 | "EditTool20250124", 41 | "EditTool20241022", 42 | ] -------------------------------------------------------------------------------- /game_agent/cradle/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from agent.cradle import run_game_agent 3 | import json 4 | import time 5 | 6 | def select_game_from_json(prompt_file_path="./json/game_prompts.json"): 7 | with open(prompt_file_path, "r", encoding="utf-8") as f: 8 | game_data = json.load(f) 9 | 10 | game_names = list(game_data.keys()) 11 | 12 | print("🎮 Select a game to play:") 13 | for idx, name in enumerate(game_names, start=1): 14 | print(f"{idx}. 
{name}") 15 | 16 | while True: 17 | try: 18 | choice = int(input("\nEnter number ▶ ")) 19 | if 1 <= choice <= len(game_names): 20 | return game_names[choice - 1] 21 | else: 22 | print("❌ Invalid number. Please try again.") 23 | except ValueError: 24 | print("❌ Please enter a number.") 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--model", default="gpt-4") 30 | parser.add_argument("--provider", default="openai") 31 | parser.add_argument("--cua", default="gpt") 32 | parser.add_argument("--max_actions", default=1000) 33 | args = parser.parse_args() 34 | 35 | game_name = select_game_from_json("./json/game_prompts.json") 36 | 37 | print("\n⏳ Preparing game... please wait.") 38 | time.sleep(5) # ✅ Delay before execution 39 | 40 | result = run_game_agent( 41 | api_provider=args.provider, 42 | model_name=args.model, 43 | game_name=game_name, 44 | cua=args.cua, 45 | max_actions=args.max_actions 46 | ) 47 | 48 | print("\n📦 Final execution result:") 49 | print(result) 50 | 51 | if __name__ == "__main__": 52 | main() -------------------------------------------------------------------------------- /game_agent/cradle/agent/cradle/info_gathering.py: -------------------------------------------------------------------------------- 1 | from api import api_caller 2 | 3 | def info_gather(system_prompt, api_provider, model_name, before_encoded): 4 | """ 5 | 현재 화면을 캡처하고 AI로 분석하여 화면 정보를 수집합니다. 6 | capture this screen and gather information 7 | """ 8 | 9 | 10 | prompt = """ 11 | The following image is a screenshot of the current computer screen.\n 12 | Carefully observe the screen and identify all **key visual elements**, such as:\n\n 13 | 14 | - Visible text (e.g., labels, instructions, titles, tooltips)\n 15 | - Interactive elements (e.g., buttons, icons, menus, input fields, sliders)\n 16 | - Status indicators or feedback messages\n 17 | - Any notable layout structures or visual groupings\n\n 18 | 19 | Objectives:\n 20 | 1. 
def api_caller(api_provider, system_prompt, model_name, move_prompts, base64_images=None):
    """
    Unified API caller for multiple model providers.

    Parameters:
    - api_provider (str): "anthropic", "openai", or "gemini"
    - system_prompt (str): System-level instruction
    - model_name (str): Model identifier (e.g., "gpt-4", "claude-3")
    - move_prompts (str): Main user prompt for this action
    - base64_images (str | list[str] | None): Single base64 image or list of them

    Returns:
    - response (str): Textual result from model
    """

    # Coerce the image argument into a list so the provider functions always
    # receive list[str]; reject anything that is not str / list / None.
    if base64_images is None:
        images = []
    elif isinstance(base64_images, str):
        images = [base64_images]
    elif isinstance(base64_images, list):
        images = base64_images
    else:
        raise TypeError("base64_images must be a base64 string, a list of strings, or None.")

    for img in images:
        if not isinstance(img, str):
            raise ValueError("Each item in base64_images must be a string.")

    # Route to the provider-specific completion function.
    if api_provider == "anthropic":
        return anthropic_completion(system_prompt, model_name, images, move_prompts)
    if api_provider == "openai":
        return openai_completion(system_prompt, model_name, images, move_prompts)
    if api_provider == "gemini":
        return gemini_completion(system_prompt, model_name, images, move_prompts)

    raise NotImplementedError(f"Unsupported API provider: '{api_provider}'")
6 | 7 | Parameters: 8 | - api_provider (str): "anthropic", "openai", or "gemini" 9 | - system_prompt (str): System-level instruction 10 | - model_name (str): Model identifier (e.g., "gpt-4", "claude-3") 11 | - move_prompts (str): Main user prompt for this action 12 | - base64_images (str | list[str] | None): Single base64 image or list of them 13 | 14 | Returns: 15 | - response (str): Textual result from model 16 | """ 17 | 18 | # --- Normalize image input --- 19 | if isinstance(base64_images, str): 20 | base64_images = [base64_images] 21 | elif base64_images is None: 22 | base64_images = [] 23 | elif not isinstance(base64_images, list): 24 | raise TypeError("base64_images must be a base64 string, a list of strings, or None.") 25 | 26 | if not all(isinstance(img, str) for img in base64_images): 27 | raise ValueError("Each item in base64_images must be a string.") 28 | 29 | # --- Dispatch based on provider --- 30 | if api_provider == "anthropic": 31 | return anthropic_completion(system_prompt, model_name, base64_images, move_prompts) 32 | 33 | elif api_provider == "openai": 34 | return openai_completion(system_prompt, model_name, base64_images, move_prompts) 35 | 36 | elif api_provider == "gemini": 37 | return gemini_completion(system_prompt, model_name, base64_images, move_prompts) 38 | 39 | else: 40 | raise NotImplementedError(f"Unsupported API provider: '{api_provider}'") -------------------------------------------------------------------------------- /game_agent/gpt_operator/README.md: -------------------------------------------------------------------------------- 1 | # **FlashAdventure: An Agent for Flash Game Environments - GPT Operator** 2 | 3 | This guide covers the execution of the GPT Operator game agent developed for research on autonomous agents in Flash game environments. 4 | 5 | All core setup, including the Python environment and API keys, is consistent with the main `README` in the parent directory. 
This guide focuses specifically on running this agent. 6 | 7 | ----- 8 | 9 | ### **1. Agent Execution** 10 | 11 | #### **1.1 The Run Script (`run.bash`)** 12 | 13 | First, navigate to the `game_agent/gpt_operator/` directory. Then, create a file named `run.bash` and add the following script. This script automatically handles loading game prompts and setting history parameters. 14 | 15 | ```bash 16 | #!/bin/bash 17 | 18 | PYTHON=python 19 | MAIN_SCRIPT="main.py" 20 | DEFAULT_PROMPT_FILE="game_prompts.json" 21 | HISTORY=10 22 | 23 | echo "🚀 Starting agent automation loop..." 24 | 25 | # Automatically use the game_prompts.json file if it exists 26 | if [ -f "$DEFAULT_PROMPT_FILE" ]; then 27 | echo "📝 $DEFAULT_PROMPT_FILE detected, using prompt automatically" 28 | $PYTHON $MAIN_SCRIPT "$DEFAULT_PROMPT_FILE" --history $HISTORY 29 | else 30 | echo "📄 $DEFAULT_PROMPT_FILE not found, running with default prompt" 31 | $PYTHON $MAIN_SCRIPT --history $HISTORY 32 | fi 33 | ``` 34 | 35 | #### **1.2 Run the Agent** 36 | 37 | To start the agent, first grant execution permissions to the script and then run it from your terminal: 38 | 39 | ```bash 40 | chmod +x run.bash # (Run this once to grant permissions) 41 | ./run.bash 42 | ``` 43 | 44 | ----- 45 | 46 | ### **2. Execution Summary** 47 | 48 | 1. **Run Script:** Create and configure the `run.bash` script with your desired parameters. 49 | 2. **Launch:** Execute `./run.bash` to start the agent.
-------------------------------------------------------------------------------- /evaluator/eval_game/eval_idol.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_affection(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Highest EXP:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_idol(): 17 | game = "Idol Days Sim Date" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_affection(conversation) 39 | 40 | if affection is not None: 41 | print(f"Highest Experience Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "idol_days_sim_date", 45 | "result": f"EXP: {affection}", 46 | "EXP": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_idol.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_idol.json") 53 | else: 54 | print("⚠️ Couldn't find EXP score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_idol()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_pico.py: 
-------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_affection(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Affection:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_pico(): 17 | game = "Pico Sim Date" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_affection(conversation) 39 | 40 | if affection is not None: 41 | print(f"💖 Affection Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "pico_sim_date", 45 | "result": f"Affection: {affection}", 46 | "affection": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_pico.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_pico.json") 53 | else: 54 | print("⚠️ Couldn't find affection score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_pico()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_grim1.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | 
from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_item(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Found Items:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_grim1(): 17 | game = "Grim Tales: The Bride" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_item(conversation) 39 | 40 | if affection is not None: 41 | print(f"Highest Experience Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "grim_tales_the_bride", 45 | "result": f"Items: {affection}", 46 | "Items": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_grim1.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_grim1.json") 53 | else: 54 | print("⚠️ Couldn't find Found Items in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_grim1()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_kingdom.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def 
extract_affection(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Highest EXP:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_kingdom(): 17 | game = "Kingdom Days" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_affection(conversation) 39 | 40 | if affection is not None: 41 | print(f"Highest Experience Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "kingdom_days", 45 | "result": f"EXP: {affection}", 46 | "EXP": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_kingdom.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_kingdom.json") 53 | else: 54 | print("⚠️ Couldn't find EXP score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_kingdom()) -------------------------------------------------------------------------------- /game_agent/claude_computer_use/README.md: -------------------------------------------------------------------------------- 1 | # **FlashAdventure: An Agent for Flash Game Environments - Claude Computer Use** 2 | 3 | This guide covers the execution of the Claude Computer Use game agent developed for research on autonomous agents in Flash game environments. 
4 | 5 | All core setup, including creating the Python environment and configuring API keys, is consistent with the main `README` in the parent directory. This guide focuses specifically on running this agent. 6 | 7 | ----- 8 | 9 | ### **1. Agent Execution** 10 | 11 | #### **1.1 The Run Script (`run.bash`)** 12 | 13 | First, navigate to the `game_agent/claude_computer_use/` directory. Create a file named `run.bash` and add the following script. This script defines key execution parameters and then launches the agent. 14 | 15 | ```bash 16 | #!/bin/bash 17 | 18 | # Ensure Python can find the agent modules in the parent directory 19 | export PYTHONPATH=$(pwd)/.. 20 | 21 | # Configuration for the agent 22 | # The file containing game-specific prompts 23 | TASK_FILE="./json/game_prompt.json" 24 | # The type of prompt to use from the file (e.g., "prompt") 25 | PROMPT_TYPE="prompt" 26 | # The maximum number of actions the agent will take in a single session 27 | MAX_ACTIONS=1000 28 | 29 | # Execute the main agent script with the defined parameters 30 | python main.py \ 31 | --task-file "$TASK_FILE" \ 32 | --prompt-type "$PROMPT_TYPE" \ 33 | --max-actions "$MAX_ACTIONS" 34 | ``` 35 | 36 | #### **1.2 Run the Agent** 37 | 38 | To start the agent, first grant execute permissions to the script and then run it from your terminal: 39 | 40 | ```bash 41 | chmod +x run.bash # (Run this once to grant permissions) 42 | ./run.bash 43 | ``` 44 | 45 | ----- 46 | 47 | ### **2. Execution Summary** 48 | 49 | 1. **Run Script:** Create and configure the `run.bash` script with your desired parameters. 50 | 2. **Launch:** Execute `./run.bash` to start the agent. 
-------------------------------------------------------------------------------- /evaluator/eval_game/eval_college.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_affection(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Combined Score:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_college(): 17 | game = "Community College Sim" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_affection(conversation) 39 | 40 | if affection is not None: 41 | print(f"Combined Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "college_sim", 45 | "result": f"Combined Score: {affection}", 46 | "Combined Score": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_college.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_college.json") 53 | else: 54 | print("⚠️ Couldn't find Combined Score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_college()) -------------------------------------------------------------------------------- 
/evaluator/eval_game/eval_festival.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_affection(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Highest EXP:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_festival(): 17 | game = "Festival Days Sim Date" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_affection(conversation) 39 | 40 | if affection is not None: 41 | print(f"Highest Experience Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "festival_sim_date", 45 | "result": f"EXP: {affection}", 46 | "EXP": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_festival.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_festival.json") 53 | else: 54 | print("⚠️ Couldn't find affection score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_festival()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_smalltown.py: 
-------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_places(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Found Place:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_smalltown(): 17 | game = "Small Town Detective" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_places(conversation) 39 | 40 | if affection is not None: 41 | print(f"Found Place: {affection}") 42 | 43 | result_obj = { 44 | "game": "small_town_detective", 45 | "result": f"Found Place: {affection}", 46 | "found_place": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_smalltown.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_smalltown.json") 53 | else: 54 | print("⚠️ Couldn't find affection score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_smalltown()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_grim2.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 
import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_item(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Found Items:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_grim2(): 17 | game = "Grim Tales: The Legacy Collector's Edition" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_item(conversation) 39 | 40 | if affection is not None: 41 | print(f"Highest Experience Score: {affection}") 42 | 43 | result_obj = { 44 | "game": "grim_tales_the_legacy_collectors_edition", 45 | "result": f"Items: {affection}", 46 | "Items": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_grim2.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_grim2.json") 53 | else: 54 | print("⚠️ Couldn't find Found Items in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_grim2()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_nickbounty.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import 
List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_places(conversation: List[str]) -> float | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Found Place:\s*(\d+(\.\d+)?)", msg, re.IGNORECASE) 12 | if match: 13 | return float(match.group(1)) 14 | return None 15 | 16 | async def eval_nickbounty(): 17 | game = "Nick Bounty: A Case of the Crabs" 18 | prompt_key = "milestone_prompt1" 19 | 20 | # Load prompt from milestone_prompts.json 21 | with open("milestone_prompts.json", "r") as f: 22 | all_data = json.load(f) 23 | 24 | data = all_data.get(game, {}) 25 | if not data or prompt_key not in data: 26 | print(f"❌ No milestone data found for {game} / {prompt_key}") 27 | return 28 | 29 | print(f"🎮 Running evaluation for: {game}") 30 | 31 | # Run agent 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | affection = extract_places(conversation) 39 | 40 | if affection is not None: 41 | print(f"Found Place: {affection}") 42 | 43 | result_obj = { 44 | "game": "nick_bounty_a_case_of_the_crabs", 45 | "result": f"Found Place: {affection}", 46 | "found_place": affection 47 | } 48 | 49 | Path("results").mkdir(exist_ok=True) 50 | with open("results/result_nickbounty.json", "w") as f: 51 | json.dump(result_obj, f, indent=2) 52 | print("📝 Result saved to results/result_nickbounty.json") 53 | else: 54 | print("⚠️ Couldn't find affection score in result.") 55 | 56 | if __name__ == "__main__": 57 | asyncio.run(eval_nickbounty()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_sherlock.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from eval_game.eval_utils import run_milestone 5 | from pathlib import Path 6 | from typing import List 7 | 8 | def 
extract_result(conversation: List[str]) -> str | None: 9 | for msg in reversed(conversation): 10 | if "New Suspect:" in msg: 11 | return msg.strip() 12 | return None 13 | 14 | def get_suspect_count(result_text: str) -> int: 15 | match = re.search(r"New Suspect:\s*(\d+)", result_text, re.IGNORECASE) 16 | return int(match.group(1)) if match else 0 17 | 18 | async def eval_sherlock(): 19 | game = "sherlock_holmes_the_tea_shop_murder_mystery" 20 | prompt_key = "milestone_prompt1" 21 | 22 | with open("milestone_prompts.json", "r") as f: 23 | all_data = json.load(f) 24 | 25 | data = all_data.get(game, {}) 26 | if not data: 27 | print(f"❌ No data found for game: {game}") 28 | return 29 | 30 | print(f"🕵️ Running evaluation for: Sherlock Holmes") 31 | 32 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 33 | 34 | if conversation is None: 35 | print("⚠️ No conversation returned from agent.") 36 | return 37 | 38 | result_text = extract_result(conversation) 39 | 40 | if result_text: 41 | count = get_suspect_count(result_text) 42 | print(f"🔍 Final Result: {result_text}") 43 | print(f"🧮 Counted Suspects: {count}") 44 | 45 | # ✅ Save Result 46 | result_obj = { 47 | "game": "sherlock", 48 | "result": result_text, 49 | "#New Suspects": count 50 | } 51 | 52 | Path("results").mkdir(exist_ok=True) 53 | with open("results/result_sherlock.json", "w") as f: 54 | json.dump(result_obj, f, indent=2) 55 | print("📝 Result saved to results/result_sherlock.json") 56 | 57 | else: 58 | print("⚠️ No 'New Suspect' result found.") 59 | 60 | if __name__ == "__main__": 61 | asyncio.run(eval_sherlock()) -------------------------------------------------------------------------------- /game_agent/UI-Tars/tools/screenshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mss 3 | 4 | def get_screenshot_dir(base_dir, cua, model_name, game_name): 5 | """Creates a directory based on game/model/agent.""" 6 | directory = 
os.path.join(base_dir, cua, model_name, game_name) 7 | os.makedirs(directory, exist_ok=True) 8 | return directory 9 | 10 | def get_next_screenshot_filename(directory): 11 | """Generates the next sequential screenshot filename in the given directory.""" 12 | existing_files = [ 13 | f for f in os.listdir(directory) 14 | if f.startswith("flash_screenshot_") and f.endswith(".png") 15 | ] 16 | 17 | numbers = [] 18 | for filename in existing_files: 19 | try: 20 | num_str = filename.replace("flash_screenshot_", "").replace(".png", "") 21 | numbers.append(int(num_str)) 22 | except ValueError: 23 | continue 24 | 25 | next_num = max(numbers, default=0) + 1 26 | return f"flash_screenshot_{next_num:04d}.png" 27 | 28 | def capture_flash_screenshot(game_name, cua, model_name, time=None): 29 | """ 30 | Captures the entire screen and saves it to a folder based on GUI agent / model. 31 | - time=None or "": screenshots/ 32 | - time="after": screenshots_after/ 33 | - time="final": screenshots_final/ 34 | """ 35 | if time not in (None, "", "after", "final"): 36 | raise ValueError("Invalid value for 'time'. 
Use 'after', 'final', or leave it empty.") 37 | 38 | if time == "after": 39 | base_dir = "screenshots_after" 40 | elif time == "final": 41 | base_dir = "screenshots_final" 42 | else: 43 | base_dir = "screenshots" 44 | 45 | directory = get_screenshot_dir(base_dir, cua, model_name, game_name) 46 | filename = get_next_screenshot_filename(directory) 47 | screenshot_path = os.path.join(directory, filename) 48 | 49 | with mss.mss() as sct: 50 | monitor = sct.monitors[1] 51 | screenshot = sct.grab(monitor) 52 | mss.tools.to_png(screenshot.rgb, screenshot.size, output=screenshot_path) 53 | 54 | print(f"[INFO] Screenshot saved to: {screenshot_path}") 55 | return screenshot_path -------------------------------------------------------------------------------- /game_agent/cradle/tools/screenshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mss 3 | 4 | def get_screenshot_dir(base_dir, cua, model_name, game_name): 5 | """Create directory based on game / model / agent""" 6 | directory = os.path.join(base_dir, cua, model_name, game_name) 7 | os.makedirs(directory, exist_ok=True) 8 | return directory 9 | 10 | def get_next_screenshot_filename(directory): 11 | """Generate the next sequential screenshot filename in the given directory""" 12 | existing_files = [ 13 | f for f in os.listdir(directory) 14 | if f.startswith("flash_screenshot_") and f.endswith(".png") 15 | ] 16 | 17 | numbers = [] 18 | for filename in existing_files: 19 | try: 20 | num_str = filename.replace("flash_screenshot_", "").replace(".png", "") 21 | numbers.append(int(num_str)) 22 | except ValueError: 23 | continue 24 | 25 | next_num = max(numbers, default=0) + 1 26 | return f"flash_screenshot_{next_num:04d}.png" 27 | 28 | def capture_flash_screenshot(game_name, cua, model_name, time=None): 29 | """ 30 | Capture the full screen and save it into a folder structured by GUI agent / model. 
31 | - time=None or "": screenshots/ 32 | - time="after": screenshots_after/ 33 | - time="final": screenshots_final/ 34 | """ 35 | if time not in (None, "", "after", "final"): 36 | raise ValueError("Invalid value for 'time'. Use 'after', 'final', or leave it empty.") 37 | 38 | if time == "after": 39 | base_dir = "screenshots_after" 40 | elif time == "final": 41 | base_dir = "screenshots_final" 42 | else: 43 | base_dir = "screenshots" 44 | 45 | directory = get_screenshot_dir(base_dir, cua, model_name, game_name) 46 | filename = get_next_screenshot_filename(directory) 47 | screenshot_path = os.path.join(directory, filename) 48 | 49 | with mss.mss() as sct: 50 | monitor = sct.monitors[1] 51 | screenshot = sct.grab(monitor) 52 | mss.tools.to_png(screenshot.rgb, screenshot.size, output=screenshot_path) 53 | 54 | print(f"[INFO] Screenshot saved to: {screenshot_path}") 55 | return screenshot_path -------------------------------------------------------------------------------- /evaluator/judge/vlm/screenshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import mss 4 | import time 5 | import subprocess 6 | 7 | 8 | # OS 감지 9 | IS_MAC = platform.system() == "Darwin" 10 | 11 | # 🔹 스크린샷 저장 디렉토리 설정 (기본: screenshots/flashpoint/) 12 | SCREENSHOT_DIR = "screenshots/flashpoint/" 13 | 14 | # 저장 경로 디렉토리 생성 15 | os.makedirs(SCREENSHOT_DIR, exist_ok=True) 16 | 17 | 18 | def get_flashpoint_window_position(): 19 | print(f"[INFO] Detecting Flashpoint window on {platform.system()}...") 20 | 21 | if IS_MAC: 22 | script = ''' 23 | tell application "System Events" 24 | set window_list to name of every window of every process whose visible is true 25 | end tell 26 | return window_list 27 | ''' 28 | result = subprocess.run(["osascript", "-e", script], capture_output=True, text=True) 29 | windows = result.stdout.strip().split(", ") 30 | for window in windows: 31 | if "Flashpoint" in window: 32 | return 100, 100, 800, 600 
# 기본값 33 | else: 34 | try: 35 | import pygetwindow as gw 36 | windows = gw.getWindowsWithTitle("Flashpoint") 37 | if windows: 38 | window = windows[0] 39 | return window.left, window.top, window.width, window.height 40 | except ImportError: 41 | print("[ERROR] pygetwindow is not installed. Run: pip install pygetwindow") 42 | return None 43 | 44 | def capture_flash_screenshot(): 45 | position = get_flashpoint_window_position() 46 | timestamp = time.strftime("%Y%m%d_%H%M%S") 47 | screenshot_path = os.path.join(SCREENSHOT_DIR, f"flash_screenshot_{timestamp}.png") 48 | 49 | with mss.mss() as sct: 50 | if position: 51 | left, top, width, height = position 52 | monitor = {"top": top, "left": left, "width": width, "height": height} 53 | else: 54 | monitor = sct.monitors[1] 55 | 56 | screenshot = sct.grab(monitor) 57 | mss.tools.to_png(screenshot.rgb, screenshot.size, output=screenshot_path) 58 | print(f"[INFO] Screenshot saved: {screenshot_path}") 59 | 60 | return screenshot_path 61 | -------------------------------------------------------------------------------- /game_agent/coast/tools/screenshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mss 3 | 4 | def get_screenshot_dir(base_dir, reasoning_model, gui_agent, game_name): 5 | """Creates a directory based on game/model/agent.""" 6 | directory = os.path.join(base_dir, gui_agent, reasoning_model, game_name) 7 | os.makedirs(directory, exist_ok=True) 8 | return directory 9 | 10 | def get_next_screenshot_filename(directory): 11 | """Generates the next sequential screenshot filename in the given directory.""" 12 | existing_files = [ 13 | f for f in os.listdir(directory) 14 | if f.startswith("flash_screenshot_") and f.endswith(".png") 15 | ] 16 | 17 | numbers = [] 18 | for filename in existing_files: 19 | try: 20 | num_str = filename.replace("flash_screenshot_", "").replace(".png", "") 21 | numbers.append(int(num_str)) 22 | except ValueError: 23 | continue 24 | 
25 | next_num = max(numbers, default=0) + 1 26 | return f"flash_screenshot_{next_num:04d}.png" 27 | 28 | def capture_flash_screenshot(game_name, gui_model, reasoning_model, time=None): 29 | """ 30 | Captures the entire screen and saves it to a folder based on GUI agent / model. 31 | - time=None or "": screenshots/ 32 | - time="after": screenshots_after/ 33 | - time="final": screenshots_final/ 34 | """ 35 | if time not in (None, "", "after", "final"): 36 | raise ValueError("Invalid value for 'time'. Use 'after', 'final', or leave it empty.") 37 | 38 | if time == "after": 39 | base_dir = "screenshots_after" 40 | elif time == "final": 41 | base_dir = "screenshots_final" 42 | else: 43 | base_dir = "screenshots" 44 | 45 | directory = get_screenshot_dir(base_dir, gui_model, reasoning_model, game_name) 46 | filename = get_next_screenshot_filename(directory) 47 | screenshot_path = os.path.join(directory, filename) 48 | 49 | with mss.mss() as sct: 50 | monitor = sct.monitors[1] 51 | screenshot = sct.grab(monitor) 52 | mss.tools.to_png(screenshot.rgb, screenshot.size, output=screenshot_path) 53 | 54 | print(f"[INFO] Screenshot saved to: {screenshot_path}") 55 | return screenshot_path -------------------------------------------------------------------------------- /evaluator/eval_game/eval_videostudio.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List, Optional 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | GREEN_RE = re.compile(r"Green\s*Lights\s*:\s*(\d+(?:\.\d+)?)", re.IGNORECASE) 10 | 11 | def extract_green_lights(conversation: List[str]) -> Optional[float]: 12 | for msg in reversed(conversation or []): 13 | m = GREEN_RE.search(msg) 14 | if m: 15 | return float(m.group(1)) 16 | return None 17 | 18 | async def eval_videostudio(): 19 | game = "Video Studio Escape" 20 | 21 | # Load prompt from milestone_prompts.json 22 | 
with open("milestone_prompts.json", "r", encoding="utf-8") as f: 23 | all_data = json.load(f) 24 | 25 | data = all_data.get(game) 26 | if not data: 27 | print(f"❌ No data found for game: {game}") 28 | return 29 | 30 | print(f"🎮 Running evaluation for: {game}") 31 | 32 | # 1) Instruction 33 | if "Instruction" in data: 34 | _ = await run_milestone(data["Instruction"], f"{game}_Instruction") 35 | 36 | # 2) milestone_prompt1 37 | if "milestone_prompt1" not in data: 38 | print(f"❌ No milestone data found for {game} / milestone_prompt1") 39 | return 40 | 41 | conversation = await run_milestone(data["milestone_prompt1"], f"{game}_milestone_prompt1") 42 | if conversation is None: 43 | print("⚠️ No conversation returned from agent.") 44 | return 45 | 46 | green = extract_green_lights(conversation) 47 | if green is not None: 48 | print(f"Green Lights: {green}") 49 | 50 | result_obj = { 51 | "game": "video_studio_escape", 52 | "result": f"green_lights: {green}", 53 | "green_lights": green, 54 | } 55 | 56 | Path("results").mkdir(exist_ok=True) 57 | with open("results/result_videostudio.json", "w", encoding="utf-8") as f: 58 | json.dump(result_obj, f, indent=2, ensure_ascii=False) 59 | print("📝 Result saved to results/result_videostudio.json") 60 | else: 61 | print("⚠️ Couldn't find green_lights in result.") 62 | 63 | if __name__ == "__main__": 64 | asyncio.run(eval_videostudio()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_paint.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List, Optional 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | DOOR_INDEX_RE = re.compile(r"Door\s*Index\s*:\s*(\d+(?:\.\d+)?)", re.IGNORECASE) 10 | 11 | def extract_door_index(conversation: List[str]) -> Optional[float]: 12 | for msg in reversed(conversation or []): 13 | m = 
DOOR_INDEX_RE.search(msg) 14 | if m: 15 | return float(m.group(1)) 16 | return None 17 | 18 | async def eval_paint(): 19 | game = "Paint Room Escape" 20 | 21 | # Load prompt from milestone_prompts.json 22 | with open("milestone_prompts.json", "r", encoding="utf-8") as f: 23 | all_data = json.load(f) 24 | 25 | data = all_data.get(game, {}) 26 | if not data: 27 | print(f"❌ No data found for game: {game}") 28 | return 29 | 30 | print(f"🎮 Running evaluation for: {game}") 31 | 32 | # 1) Instruction 33 | if "Instruction" in data: 34 | _ = await run_milestone(data["Instruction"], f"{game}_Instruction") 35 | 36 | # 2) milestone_prompt1 37 | prompt_key = "milestone_prompt1" 38 | if prompt_key not in data: 39 | print(f"❌ No milestone data found for {game} / {prompt_key}") 40 | return 41 | 42 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 43 | if conversation is None: 44 | print("⚠️ No conversation returned from agent.") 45 | return 46 | 47 | door_index = extract_door_index(conversation) 48 | 49 | if door_index is not None: 50 | print(f"Door color index: {door_index}") 51 | 52 | result_obj = { 53 | "game": "paint_room_escape", 54 | "result": f"door_color_index: {door_index}", 55 | "door_color_index": door_index, 56 | } 57 | 58 | Path("results").mkdir(exist_ok=True) 59 | with open("results/result_paint.json", "w", encoding="utf-8") as f: 60 | json.dump(result_obj, f, indent=2, ensure_ascii=False) 61 | print("📝 Result saved to results/result_paint.json") 62 | else: 63 | print("⚠️ Couldn't find door_color_index in result.") 64 | 65 | if __name__ == "__main__": 66 | asyncio.run(eval_paint()) -------------------------------------------------------------------------------- /game_agent/UI-Tars/mm_agents/README.md: -------------------------------------------------------------------------------- 1 | # Agent 2 | ## Prompt-based Agents 3 | 4 | ### Supported Models 5 | We currently support the following models as the foundational models for the agents: 6 | 
- `GPT-3.5` (gpt-3.5-turbo-16k, ...)
- `GPT-4` (gpt-4-0125-preview, gpt-4-1106-preview, ...)
- `GPT-4V` (gpt-4-vision-preview, ...)
- `Gemini-Pro`
- `Gemini-Pro-Vision`
- `Claude-3, 2` (claude-3-haiku-20240307, claude-3-sonnet-20240229, ...)
- ...

And those from the open-source community:
- `Mixtral 8x7B`
- `QWEN`, `QWEN-VL`
- `CogAgent`
- `Llama3`
- ...

In the future, we will integrate and support more foundational models to enhance digital agents, so stay tuned.

### How to use

```python
from mm_agents.agent import PromptAgent

agent = PromptAgent(
    model="gpt-4-vision-preview",
    observation_type="screenshot",
)
agent.reset()
# say we have an instruction and observation
instruction = "Please help me to find the nearest restaurant."
obs = {"screenshot": open("path/to/observation.jpg", 'rb').read()}
response, actions = agent.predict(
    instruction,
    obs
)
```

### Observation Space and Action Space
We currently support the following observation spaces:
- `a11y_tree`: the accessibility tree of the current screen
- `screenshot`: a screenshot of the current screen
- `screenshot_a11y_tree`: a screenshot of the current screen with the accessibility tree overlay
- `som`: the set-of-mark trick on the current screen, with table metadata included.

And the following action spaces:
- `pyautogui`: valid Python code using the `pyautogui` library
- `computer_13`: a set of enumerated actions designed by us

To feed an observation into the agent, you have to maintain the `obs` variable as a dict with the corresponding information:
```python
# continue from the previous code snippet
obs = {
    "screenshot": open("path/to/observation.jpg", 'rb').read(),
    "a11y_tree": ""  # [a11y_tree data]
}
response, actions = agent.predict(
    instruction,
    obs
)
```

## Efficient Agents, Q* Agents, and more
Stay tuned for more updates.
--------------------------------------------------------------------------------
/game_agent/UI-Tars/README.md:
--------------------------------------------------------------------------------
# **FlashAdventure: An Agent for Flash Game Environments - UI-TARS**

This guide covers the execution of the UI-TARS game agent, which is based on the OSWorld source code. Developed for our research on autonomous agents in Flash game environments, this guide details the steps required to configure and run the agent.

The core setup, including the Python environment and API keys, is consistent with the main `README` in the parent directory. This guide focuses specifically on the UI-TARS agent.

## **1. Agent Execution**

### **1.1 Launch the vLLM Model Server**

The UI-TARS agent requires a vLLM model server to be running. First, launch the model server on a remote GPU machine using the following command:

```bash
python -m vllm.entrypoints.openai.api_server --served-model-name ui-tars --model "ByteDance-Seed/UI-TARS-1.5-7B" --limit-mm-per-prompt image=10 --port 8000
```

### **1.2 Configure the Agent's API Endpoint**

Next, you must configure the agent to connect to your running server.
Open the `mm_agents/uitars_agent.py` file and modify the `base_url` to point to your server's address.

```python
## Line 574
self.vlm = OpenAI(
    base_url="http://your_server_url:8000/v1", api_key="empty"
)
```

### **1.3 The Run Script (`run.bash`)**

Now, create a `run.bash` file in the `game_agent/UI-Tars/` directory with the following content. This simple script will execute the main entry point for the agent.

```bash
#!/bin/bash

# Execute the main agent script
python run_uitars.py
```

### **1.4 Run the Agent**

To start the agent, first grant execution permissions to the script and then run it from your terminal:

```bash
chmod +x run.bash # (Run this once to grant permissions)
./run.bash
```

-----

### **2. Execution Summary**

1. **Load Model:** Launch the UI-Tars model server on your remote GPU.
2. **Configure Agent:** Update the `base_url` in `mm_agents/uitars_agent.py` to point to your server's address.
3. **Run Script:** Create and configure the `run.bash` script with your desired parameters.
4. **Launch:** Execute `./run.bash` to start the agent.
--------------------------------------------------------------------------------
/game_agent/cradle/README.md:
--------------------------------------------------------------------------------
4 | 5 | ----- 6 | 7 | # **FlashAdventure: An Agent for Flash Game Environments - Cradle** 8 | 9 | This project presents a reproduction of the `cradle` game agent, adapted for our research on autonomous agents in Flash game environments. This guide details the steps required to configure and run the agent. 10 | 11 | All core setup, including the Python environment and API keys, is consistent with the main `README` in the parent directory. This guide focuses specifically on running this agent. 12 | 13 | ## **1. Agent Execution** 14 | 15 | ### **1.1 The Run Script (`run.bash`)** 16 | 17 | First, navigate to the `game_agent/cradle/` directory. Then, create a `run.bash` file and add the following script. You can directly edit the variables in this script to configure the agent's behavior. 18 | 19 | ```bash 20 | #!/bin/bash 21 | 22 | # Configuration 23 | # API Provider: Select one of "anthropic" or "openai". 24 | PROVIDER="anthropic" 25 | 26 | # LLM Model: Specifies the model for high-level reasoning. 27 | MODEL="claude-3-7-sonnet-20250219" 28 | 29 | # GUI Agent Type: Select the agent responsible for mouse/keyboard operations. 30 | # "claude": A Claude agent specialized for computer control. 31 | # "sonnet": An agent that uses the original Claude Sonnet model directly. 32 | # "uground": An open-source UGround model. 33 | CUA="claude" 34 | 35 | # Execute 36 | python main.py --model "$MODEL" --provider "$PROVIDER" --cua "$CUA" 37 | ``` 38 | 39 | ### **1.2 Run the Agent** 40 | 41 | To start the agent, grant execute permissions and run the script from your terminal: 42 | 43 | ```bash 44 | chmod +x run.bash # (Run this once) 45 | ./run.bash 46 | ``` 47 | 48 | ----- 49 | 50 | ## **2. Execution Summary** 51 | 52 | 1. **Run Script:** Create and configure `run.bash` with your desired parameters. 53 | 2. **Launch:** Execute `./run.bash` to start the agent. 
-------------------------------------------------------------------------------- /game_agent/cradle/claude_cua/tools/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from dataclasses import dataclass, fields, replace 3 | from typing import Any 4 | 5 | from anthropic.types.beta import BetaToolUnionParam 6 | 7 | 8 | class BaseAnthropicTool(metaclass=ABCMeta): 9 | """Abstract base class for Anthropic-defined tools.""" 10 | 11 | @abstractmethod 12 | def __call__(self, **kwargs) -> Any: 13 | """Executes the tool with the given arguments.""" 14 | ... 15 | 16 | @abstractmethod 17 | def to_params( 18 | self, 19 | ) -> BetaToolUnionParam: 20 | raise NotImplementedError 21 | 22 | 23 | @dataclass(kw_only=True, frozen=True) 24 | class ToolResult: 25 | """Represents the result of a tool execution.""" 26 | 27 | output: str | None = None 28 | error: str | None = None 29 | base64_image: str | None = None 30 | system: str | None = None 31 | 32 | def __bool__(self): 33 | return any(getattr(self, field.name) for field in fields(self)) 34 | 35 | def __add__(self, other: "ToolResult"): 36 | def combine_fields( 37 | field: str | None, other_field: str | None, concatenate: bool = True 38 | ): 39 | if field and other_field: 40 | if concatenate: 41 | return field + other_field 42 | raise ValueError("Cannot combine tool results") 43 | return field or other_field 44 | 45 | return ToolResult( 46 | output=combine_fields(self.output, other.output), 47 | error=combine_fields(self.error, other.error), 48 | base64_image=combine_fields(self.base64_image, other.base64_image, False), 49 | system=combine_fields(self.system, other.system), 50 | ) 51 | 52 | def replace(self, **kwargs): 53 | """Returns a new ToolResult with the given fields replaced.""" 54 | return replace(self, **kwargs) 55 | 56 | 57 | class CLIResult(ToolResult): 58 | """A ToolResult that can be rendered as a CLI output.""" 59 | 60 | 61 | class 
ToolFailure(ToolResult): 62 | """A ToolResult that represents a failure.""" 63 | 64 | 65 | class ToolError(Exception): 66 | """Raised when a tool encounters an error.""" 67 | 68 | def __init__(self, message): 69 | self.message = message -------------------------------------------------------------------------------- /evaluator/judge/computer_use/tools/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from dataclasses import dataclass, fields, replace 3 | from typing import Any 4 | 5 | from anthropic.types.beta import BetaToolUnionParam 6 | 7 | 8 | class BaseAnthropicTool(metaclass=ABCMeta): 9 | """Abstract base class for Anthropic-defined tools.""" 10 | 11 | @abstractmethod 12 | def __call__(self, **kwargs) -> Any: 13 | """Executes the tool with the given arguments.""" 14 | ... 15 | 16 | @abstractmethod 17 | def to_params( 18 | self, 19 | ) -> BetaToolUnionParam: 20 | raise NotImplementedError 21 | 22 | 23 | @dataclass(kw_only=True, frozen=True) 24 | class ToolResult: 25 | """Represents the result of a tool execution.""" 26 | 27 | output: str | None = None 28 | error: str | None = None 29 | base64_image: str | None = None 30 | system: str | None = None 31 | 32 | def __bool__(self): 33 | return any(getattr(self, field.name) for field in fields(self)) 34 | 35 | def __add__(self, other: "ToolResult"): 36 | def combine_fields( 37 | field: str | None, other_field: str | None, concatenate: bool = True 38 | ): 39 | if field and other_field: 40 | if concatenate: 41 | return field + other_field 42 | raise ValueError("Cannot combine tool results") 43 | return field or other_field 44 | 45 | return ToolResult( 46 | output=combine_fields(self.output, other.output), 47 | error=combine_fields(self.error, other.error), 48 | base64_image=combine_fields(self.base64_image, other.base64_image, False), 49 | system=combine_fields(self.system, other.system), 50 | ) 51 | 52 | def replace(self, 
**kwargs): 53 | """Returns a new ToolResult with the given fields replaced.""" 54 | return replace(self, **kwargs) 55 | 56 | 57 | class CLIResult(ToolResult): 58 | """A ToolResult that can be rendered as a CLI output.""" 59 | 60 | 61 | class ToolFailure(ToolResult): 62 | """A ToolResult that represents a failure.""" 63 | 64 | 65 | class ToolError(Exception): 66 | """Raised when a tool encounters an error.""" 67 | 68 | def __init__(self, message): 69 | self.message = message 70 | -------------------------------------------------------------------------------- /game_agent/claude_computer_use/tools/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from dataclasses import dataclass, fields, replace 3 | from typing import Any 4 | 5 | from anthropic.types.beta import BetaToolUnionParam 6 | 7 | 8 | class BaseAnthropicTool(metaclass=ABCMeta): 9 | """Abstract base class for Anthropic-defined tools.""" 10 | 11 | @abstractmethod 12 | def __call__(self, **kwargs) -> Any: 13 | """Executes the tool with the given arguments.""" 14 | ... 
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass, fields, replace
from typing import Any

from anthropic.types.beta import BetaToolUnionParam


class BaseAnthropicTool(metaclass=ABCMeta):
    """Abstract base class for Anthropic-defined tools."""

    @abstractmethod
    def __call__(self, **kwargs) -> Any:
        """Executes the tool with the given arguments."""
        ...

    @abstractmethod
    def to_params(
        self,
    ) -> BetaToolUnionParam:
        # Concrete tools return the tool-definition payload (name, input
        # schema, ...) that is sent to the Anthropic API with a request.
        raise NotImplementedError


@dataclass(kw_only=True, frozen=True)
class ToolResult:
    """Represents the result of a tool execution.

    Immutable; use replace() to derive a modified copy, or `+` to merge
    two results.
    """

    output: str | None = None        # textual output produced by the tool
    error: str | None = None         # error text when the tool failed
    base64_image: str | None = None  # base64-encoded image payload (e.g. screenshot)
    system: str | None = None        # system-level message for the agent loop

    def __bool__(self):
        # Truthy when at least one field is populated.
        return any(getattr(self, field.name) for field in fields(self))

    def __add__(self, other: "ToolResult"):
        """Merge two results field-by-field.

        Text fields are concatenated; base64_image cannot be concatenated,
        so combining two results that both carry an image raises ValueError.
        NOTE: always returns a base ToolResult, even when self is a subclass.
        """
        def combine_fields(
            field: str | None, other_field: str | None, concatenate: bool = True
        ):
            if field and other_field:
                if concatenate:
                    return field + other_field
                raise ValueError("Cannot combine tool results")
            # At most one side is set: keep whichever is non-empty.
            return field or other_field

        return ToolResult(
            output=combine_fields(self.output, other.output),
            error=combine_fields(self.error, other.error),
            base64_image=combine_fields(self.base64_image, other.base64_image, False),
            system=combine_fields(self.system, other.system),
        )

    def replace(self, **kwargs):
        """Returns a new ToolResult with the given fields replaced."""
        return replace(self, **kwargs)


class CLIResult(ToolResult):
    """A ToolResult that can be rendered as a CLI output."""


class ToolFailure(ToolResult):
    """A ToolResult that represents a failure."""


class ToolError(Exception):
    """Raised when a tool encounters an error."""

    def __init__(self, message):
        self.message = message
def create_response(**kwargs):
    """POST the given payload to the OpenAI /v1/responses endpoint.

    Keyword arguments are forwarded verbatim as the JSON request body.
    Returns the decoded JSON response on success, or an empty dict when the
    request fails (network error, timeout, or a non-2xx status) — matching
    the hardened copy of this helper in coast/gui_agent/gpt_cua/utils.py.
    """
    url = "https://api.openai.com/v1/responses"
    headers = {
        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
        "Content-Type": "application/json"
    }

    # Optional organization header.
    openai_org = os.getenv("OPENAI_ORG")
    if openai_org:
        headers["Openai-Organization"] = openai_org

    try:
        # Bounded timeout so a hung request cannot stall the agent loop.
        response = requests.post(url, headers=headers, json=kwargs, timeout=30)
        # Raise on 4xx/5xx instead of attempting to parse an error body,
        # which previously could raise an opaque JSON decode error.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        # Best-effort contract: log and return a falsy dict rather than
        # crashing the caller.
        print(f"[ERROR] OpenAI API request failed: {e}")
        return {}
def check_blocklisted_url(url: str) -> None:
    """Reject URLs whose host is on the blocklist.

    Raises ValueError when the URL's hostname equals a blocked domain or is
    any subdomain of one; returns None otherwise.
    """
    host = urlparse(url).hostname or ""
    for domain in BLOCKED_DOMAINS:
        # Exact match, or a dot-separated subdomain of the blocked domain.
        if host == domain or host.endswith(f".{domain}"):
            raise ValueError(f"Blocked URL: {url}")
import importlib.util
import asyncio
import os

# Maps the human-readable game name to its evaluation module in ./eval_game.
# Each module eval_<x>.py defines an async entry point with the same name.
AVAILABLE_EVALS = {
    "sherlock": "eval_sherlock",
    "sherlock2": "eval_sherlock2",
    "small town detective": "eval_smalltown",
    "nick bounty a case of the crabs": "eval_nickbounty",
    "gamecafe": "eval_gamecafe",
    "paint room escape": "eval_paint",
    "video studio escape": "eval_videostudio",
    "vortex1": "eval_vortex",
    "vortex2": "eval_vortex2",
    "vortex3": "eval_vortex3",
    "pierre": "eval_pierre",
    "dakota": "eval_dakota",
    "saucy": "eval_saucy",
    "ray and cooper2": "eval_ray2",
    "design house escape": "eval_design",
    "mirror room escape": "eval_mirror",
    "pico sim date": "eval_pico",
    "festival days sim date": "eval_festival",
    "kingdom days": "eval_kingdom",
    "idol days sim date": "eval_idol",
    "community college sim": "eval_college",
    "grim tales the bride": "eval_grim1",
    "grim tales the legacy collectors edition": "eval_grim2",
    "chemical room escape": "eval_chemical",
    "computer office escape": "eval_computer",
    "crimson room": "eval_crimson",
    "geometric room escape": "eval_geometric",
    "machine room escape": "eval_machine",
    "sort the court": "eval_sort",
    "space museum escape": "eval_space",
    "camping room escape": "eval_camping",
    "vending machine room escape": "eval_vending",
    "wood workshop escape": "eval_wood",
    "elevator room escape": "eval_elevator"
}

def choose_game():
    """Prompt the user to pick a game and return its module name.

    Re-prompts until a number within the menu range is entered.  The old
    version did `int(input()) - 1` unchecked: non-numeric input crashed, and
    entering 0 silently selected the LAST game via negative list indexing.
    """
    print("Available evaluations:")
    for i, name in enumerate(AVAILABLE_EVALS, 1):
        print(f"{i}. {name}")
    while True:
        raw = input("Select game to evaluate: ")
        try:
            idx = int(raw) - 1
        except ValueError:
            print("Please enter a number.")
            continue
        if 0 <= idx < len(AVAILABLE_EVALS):
            return list(AVAILABLE_EVALS.values())[idx]
        print(f"Please enter a number between 1 and {len(AVAILABLE_EVALS)}.")

async def main():
    """Load the chosen evaluation module from disk and run its entry point."""
    module_path = "./eval_game"
    module_name = choose_game()
    file_path = os.path.join(module_path, f"{module_name}.py")

    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load evaluation module from {file_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # Module and entry point share the same "eval_<x>" name, so use it
    # directly.  The old derivation f"eval_{module_name.split('_')[-1]}"
    # breaks for any module name containing more than one underscore.
    await module.__dict__[module_name]()

if __name__ == "__main__":
    asyncio.run(main())
"last_attempted": None 32 | } 33 | 34 | print(f"🔍 Running evaluation for: {game}") 35 | 36 | try: 37 | for i in range(1, 5): 38 | key = f"milestone_prompt{i}" 39 | result_obj["last_attempted"] = key 40 | print(f"\n🏁 {key}") 41 | 42 | convo = await run_milestone(data[key], f"{game}_{key}") 43 | result = extract_boolean_result(convo) 44 | 45 | match i: 46 | case 1: result_obj["results"]["pub_owner_gone"] = result 47 | case 2: result_obj["results"]["magician_gone"] = result 48 | case 3: result_obj["results"]["security_guard_gone"] = result 49 | case 4: result_obj["results"]["fence_open"] = result 50 | 51 | print(f"✅ Result: {result}") 52 | 53 | if result is not True: 54 | print(f"🛑 Condition failed at {key}. Stopping evaluation.") 55 | result_obj["failed_at"] = key 56 | return 57 | 58 | print("🎉 All milestones passed successfully!") 59 | 60 | except KeyboardInterrupt: 61 | print("\n⚠️ Evaluation interrupted by user.") 62 | 63 | finally: 64 | Path("results").mkdir(exist_ok=True) 65 | with open("results/result_vortex2.json", "w") as f: 66 | json.dump(result_obj, f, indent=2) 67 | print("📝 Result saved to results/result_vortex2.json") 68 | 69 | if __name__ == "__main__": 70 | asyncio.run(eval_vortex2()) -------------------------------------------------------------------------------- /game_agent/coast/gui_agent/gpt_cua/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | from dotenv import load_dotenv 4 | import json 5 | import base64 6 | from PIL import Image 7 | from io import BytesIO 8 | import io 9 | from urllib.parse import urlparse 10 | 11 | load_dotenv(override=True) 12 | 13 | BLOCKED_DOMAINS = [ 14 | "maliciousbook.com", 15 | "evilvideos.com", 16 | "darkwebforum.com", 17 | "shadytok.com", 18 | "suspiciouspins.com", 19 | "ilanbigio.com", 20 | ] 21 | 22 | 23 | def pp(obj): 24 | print(json.dumps(obj, indent=4)) 25 | 26 | 27 | def show_image(base_64_image): 28 | image_data = 
def sanitize_message(msg: dict) -> dict:
    """Redact the screenshot from a computer_call_output message.

    Returns a shallow copy whose output dict has image_url replaced with
    "[omitted]"; every other message is returned unchanged.
    """
    # Guard clauses: only computer_call_output messages with a dict output
    # carry an image to redact.
    if msg.get("type") != "computer_call_output":
        return msg
    output = msg.get("output", {})
    if not isinstance(output, dict):
        return msg

    redacted = dict(msg)
    redacted["output"] = {**output, "image_url": "[omitted]"}
    return redacted
typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | 10 | def extract_boolean_result(conversation: List[str]) -> bool | None: 11 | for msg in reversed(conversation): 12 | match = re.search(r"Result:\s*(True|False)", msg, re.IGNORECASE) 13 | if match: 14 | return match.group(1).lower() == "true" 15 | return None 16 | 17 | async def eval_saucy(): 18 | game = "Saucy Devil Gordon" 19 | 20 | with open("milestone_prompts.json", "r") as f: 21 | all_data = json.load(f) 22 | 23 | data = all_data.get(game, {}) 24 | if not data: 25 | print(f"❌ No data found for game: {game}") 26 | return 27 | 28 | result_obj = { 29 | "game": "saucy_devil_gordon", 30 | "results": {}, 31 | "failed_at": None, 32 | "last_attempted": None 33 | } 34 | 35 | print(f"🔍 Running evaluation for: {game}") 36 | 37 | milestones = [ 38 | ("milestone_prompt1", "pick_coconut", "🥥 Coconut Picked"), 39 | ("milestone_prompt2", "pick_pineapple", "🍍 Pineapple Picked"), 40 | ("milestone_prompt3", "door_open", "🚪 Door is Open"), 41 | ("milestone_prompt4", "rock_light", "💡 Rock Light"), 42 | ] 43 | 44 | try: 45 | for key, result_key, label in milestones: 46 | result_obj["last_attempted"] = key 47 | print(f"\n🏁 {key}") 48 | 49 | if key not in data: 50 | print(f"⚠️ Skipping {key} (not in prompt data)") 51 | continue 52 | 53 | convo = await run_milestone(data[key], f"{game}_{key}") 54 | result = extract_boolean_result(convo) 55 | result_obj["results"][result_key] = result 56 | 57 | print(f"{label}: {result}") 58 | 59 | if result is not True: 60 | print(f"🛑 {label} failed. 
async def eval_design():
    """Evaluate the 'Design House Escape' game milestone-by-milestone.

    Reads prompts from milestone_prompts.json, runs each milestone through
    run_milestone(), and writes a summary to results/result_design.json.
    Evaluation stops at the first milestone that does not come back True.
    """
    game = "Design House Escape"

    with open("milestone_prompts.json", "r") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    # failed_at / last_attempted let post-hoc analysis distinguish
    # "never ran" from "ran and failed".
    result_obj = {
        "game": "design_house_escape",
        "results": {},
        "failed_at": None,
        "last_attempted": None
    }

    print(f"🔍 Running evaluation for: {game}")

    # (prompt key, result field, human-readable label) per milestone.
    milestones = [
        ("milestone_prompt1", "cube_exists", "🧊 Cube Exists"),
        ("milestone_prompt2", "1st_door_open", "🚪 1st Door Open"),
        ("milestone_prompt3", "2nd_door_open", "🚪 2nd Door Open"),
        ("milestone_prompt4", "3rd_door_open", "🚪 3rd Door Open"),
    ]

    try:
        for key, result_key, label in milestones:
            result_obj["last_attempted"] = key
            print(f"\n🏁 {key}")

            if key not in data:
                print(f"⚠️ Skipping {key} (not found in prompt data)")
                continue

            convo = await run_milestone(data[key], f"{game}_{key}")
            result = extract_boolean_result(convo)
            result_obj["results"][result_key] = result

            print(f"{label}: {result}")

            # Milestones are sequential: a non-True result (False or None)
            # makes the remaining ones unreachable, so stop here.
            if result is not True:
                print(f"🛑 {label} failed. Stopping evaluation.")
                result_obj["failed_at"] = key
                return

        print("🎉 All milestones passed successfully!")

    except KeyboardInterrupt:
        print("\n⚠️ Evaluation interrupted by user.")

    finally:
        # Runs on success, early return, and Ctrl-C alike, so a (possibly
        # partial) result file is always written.
        Path("results").mkdir(exist_ok=True)
        with open("results/result_design.json", "w") as f:
            json.dump(result_obj, f, indent=2)
        print("📝 Result saved to results/result_design.json")
("milestone_prompt1", "eat_hamburger", "Man eating hamburger"), 39 | ("milestone_prompt2", "toilet_open", "Toilet Open"), 40 | ("milestone_prompt3", "shopkeeper_gone", "Shop Keeper Gone"), 41 | ("milestone_prompt4", "ride_boat", "Ride Boat") 42 | ] 43 | 44 | try: 45 | for key, result_key, label in milestones: 46 | result_obj["last_attempted"] = key 47 | print(f"\n🏁 {key}") 48 | 49 | if key not in data: 50 | print(f"⚠️ Skipping {key} (not present in prompt data)") 51 | continue 52 | 53 | convo = await run_milestone(data[key], f"{game}_{key}") 54 | result = extract_boolean_result(convo) 55 | result_obj["results"][result_key] = result 56 | 57 | print(f"{label}: {result}") 58 | 59 | if result is not True: 60 | print(f"🛑 {label} failed. Stopping evaluation.") 61 | result_obj["failed_at"] = key 62 | return 63 | 64 | print("🎉 All milestones passed successfully!") 65 | 66 | except KeyboardInterrupt: 67 | print("\n⚠️ Evaluation interrupted by user.") 68 | 69 | finally: 70 | Path("results").mkdir(exist_ok=True) 71 | with open("results/result_vortex3.json", "w") as f: 72 | json.dump(result_obj, f, indent=2) 73 | print("📝 Result saved to results/result_vortex3.json") 74 | 75 | if __name__ == "__main__": 76 | asyncio.run(eval_vortex3()) -------------------------------------------------------------------------------- /evaluator/eval_game/eval_mirror.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_boolean_result(conversation: List[str]) -> bool | None: 10 | for msg in reversed(conversation): 11 | match = re.search(r"Result:\s*(True|False)", msg, re.IGNORECASE) 12 | if match: 13 | return match.group(1).lower() == "true" 14 | return None 15 | 16 | async def eval_mirror(): 17 | game = "Mirror Room Escape" 18 | 19 | with open("milestone_prompts.json", "r") as f: 20 | 
all_data = json.load(f) 21 | 22 | data = all_data.get(game, {}) 23 | if not data: 24 | print(f"❌ No data found for game: {game}") 25 | return 26 | 27 | result_obj = { 28 | "game": "mirror_room_escape", 29 | "results": {}, 30 | "failed_at": None, 31 | "last_attempted": None 32 | } 33 | 34 | print(f"🔍 Running evaluation for: {game}") 35 | 36 | milestones = [ 37 | ("milestone_prompt1", "colorful_door_open", "🎨 Colorful Door Open"), 38 | ("milestone_prompt2", "cabinet_mirror_open", "🪞 Cabinet Mirror Open"), 39 | ("milestone_prompt3", "display_show", "🖥️ Display Shown"), 40 | ("milestone_prompt4", "2nd_door_open", "🚪 2nd Door Open"), 41 | ] 42 | 43 | try: 44 | for key, result_key, label in milestones: 45 | result_obj["last_attempted"] = key 46 | print(f"\n🏁 {key}") 47 | 48 | if key not in data: 49 | print(f"⚠️ Skipping {key} (not found in prompt data)") 50 | continue 51 | 52 | convo = await run_milestone(data[key], f"{game}_{key}") 53 | result = extract_boolean_result(convo) 54 | result_obj["results"][result_key] = result 55 | 56 | print(f"{label}: {result}") 57 | 58 | if result is not True: 59 | print(f"🛑 {label} failed. Stopping evaluation.") 60 | result_obj["failed_at"] = key 61 | return 62 | 63 | print("🎉 All milestones passed successfully!") 64 | 65 | except KeyboardInterrupt: 66 | print("\n⚠️ Evaluation interrupted by user.") 67 | 68 | finally: 69 | Path("results").mkdir(exist_ok=True) 70 | with open("results/result_mirror.json", "w") as f: 71 | json.dump(result_obj, f, indent=2) 72 | print("📝 Result saved to results/result_mirror.json") 73 | 74 | if __name__ == "__main__": 75 | asyncio.run(eval_mirror()) -------------------------------------------------------------------------------- /game_agent/cradle/tools/load_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def save_chat_log(entry, game_name, api_model, cua): 6 | """ 7 | Save game action logs to a JSON file. 
def _load_prompt_field(game_name, field, missing_message):
    """Shared loader: read game_prompts.json and return one field for a game.

    Raises ValueError with *missing_message* when the game is absent.
    """
    json_path = "./json/game_prompts.json"
    with open(json_path, "r", encoding="utf-8") as f:
        game_data = json.load(f)

    if game_name not in game_data:
        raise ValueError(missing_message)
    return game_data[game_name][field]


def load_game_prompt(game_name):
    """ Load the prompt and control keys for a specific game from JSON """
    return _load_prompt_field(
        game_name, "prompt", f"No prompt found for game '{game_name}'."
    )


def load_system_prompt(game_name):
    """ Load the system prompt for a specific game from JSON """
    return _load_prompt_field(
        game_name, "system_prompt", f"No system prompt found for game '{game_name}'."
    )
def load_memory_prompt(game_name, memory_type="task", path_map=None):
    """
    Load a memory JSON file for *game_name*.

    memory_type: "task" | "skill" (or any key supplied via path_map)
    path_map: optional dict overriding/extending the default file paths
    Returns the parsed JSON, or an empty container ({} for "skill",
    [] otherwise) when the file does not exist.
    """
    paths = {
        "task": f"./{game_name}/task_memory.json",
        "skill": f"./{game_name}/skills.json",
    }
    if path_map:
        paths.update(path_map)

    if memory_type not in paths:
        raise ValueError("memory_type must be either 'task' or 'skill'.")

    memory_file = paths[memory_type]
    if os.path.exists(memory_file):
        with open(memory_file, "r", encoding="utf-8") as f:
            return json.load(f)

    # Missing file: fall back to an empty container of the expected shape.
    return {} if memory_type == "skill" else []
Loads the prompt and action keys for a specific game from a JSON file """ 42 | json_path = "./json/game_prompts.json" 43 | with open(json_path, "r", encoding="utf-8") as f: 44 | game_data = json.load(f) 45 | 46 | if game_name in game_data: 47 | g = game_data[game_name] 48 | return g["system_prompt"] 49 | else: 50 | raise ValueError(f"No system prompt exists for game '{game_name}'.") 51 | 52 | def load_memory_prompt(game_name, memory_type="task", path_map=None): 53 | """ 54 | Loads the JSON memory file based on memory type. 55 | memory_type: "task" | "skill" 56 | path_map: Optional dict to customize the path 57 | """ 58 | default_paths = { 59 | "task": f"./{game_name}/task_memory.json", 60 | "skill": f"./{game_name}/skills.json" 61 | } 62 | 63 | if path_map: 64 | default_paths.update(path_map) 65 | if memory_type not in default_paths: 66 | raise ValueError("memory_type must be 'task' or 'skill'.") 67 | 68 | json_path = default_paths[memory_type] 69 | 70 | if not os.path.exists(json_path): 71 | return {} if memory_type == "skill" else [] 72 | with open(json_path, "r", encoding="utf-8") as f: 73 | return json.load(f) -------------------------------------------------------------------------------- /evaluator/judge/vlm/tools/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import base64 3 | import re 4 | 5 | def encode_image(image_path): 6 | """ 7 | Read a file from disk and return its contents as a base64-encoded string. 
8 | """ 9 | with open(image_path, "rb") as image_file: 10 | return base64.b64encode(image_file.read()).decode("utf-8") 11 | 12 | def log_output(thread_id, log_text, game): 13 | """ 14 | Logs output to `cache/thread_{thread_id}/output.log` 15 | """ 16 | thread_folder = f"cache/{game}/thread_{thread_id}" 17 | os.makedirs(thread_folder, exist_ok=True) 18 | 19 | log_path = os.path.join(thread_folder, "output.log") 20 | with open(log_path, "w", encoding="utf-8") as log_file: 21 | log_file.write(log_text + "\n\n") 22 | 23 | def extract_python_code(content): 24 | if not content: 25 | print("[ERROR] extract_python_code() received empty content") 26 | return "", "" 27 | 28 | print(f"[DEBUG] Raw content received:\n{content}\n") 29 | 30 | # 🔹 "code" 키 다음에 나오는 Python 코드 부분만 추출 31 | match = re.search(r'"code"\s*:\s*"""\s*(.*?)\s*"""', content, re.DOTALL) 32 | action_match = re.search(r'"action":\s*"([^"]+)"', content) # 🔹 action 값 찾기 33 | 34 | action_text = action_match.group(1) if action_match else "" # 🔹 문자열 값만 추출 35 | 36 | if match: 37 | code_content = match.group(1) # Python 코드 부분만 가져오기 38 | 39 | # 🔹 주석 제거 (멀티라인 """ """ 주석 & 단일 줄 # 주석) 40 | code_content = re.sub(r'""".*?"""', '', code_content, flags=re.DOTALL).strip() 41 | code_content = re.sub(r'^\s*#.*$', '', code_content, flags=re.MULTILINE).strip() 42 | 43 | print(f"[DEBUG] Extracted Code:\n{code_content}\n") 44 | print(f"[DEBUG] Extracted Action:\n{action_text}\n") # 🔹 action 값 출력 45 | return code_content, action_text 46 | 47 | print("[ERROR] No Python code found in content.") 48 | return "", action_text # 🔹 항상 action_text도 반환 49 | 50 | ### action 후에 화면 변화했는지 찾기 51 | def extract_action_change(content): # content = "reason: ... 
### Check whether the screen changed after the action
def extract_action_change(content): # content = "reason: ... Success_Action: True"
    """Parse a judge response for its "Success_Action: True/False" verdict.

    Returns True or False when the marker is found (case-insensitive).
    Returns "" when *content* is empty or the marker is absent —
    NOTE(review): callers therefore get a mixed bool-or-str return;
    presumably they only test truthiness — confirm at call sites.
    """
    if not content:
        print("empty content")
        return ""

    print(f"[DEBUG] Raw content received:\n{content}\n")

    # Look for "Success_Action: True" or "Success_Action: False".
    match = re.search(r"Success_Action:\s*(True|False)", content, re.IGNORECASE)

    if match:
        result_success = match.group(1).lower() == "true"  # convert string to boolean
        return result_success  # True or False

    print("[WARNING] Success_Action not found in content.")
    return ""
    def reset(self, task_config: Optional[Dict[str, Any]] = None, **kwargs) -> Dict[str, Any]:
        """Clear the action history, pick up task id/instruction from
        *task_config* (when given), and return the initial observation."""
        self.action_history.clear()
        self.task_id = task_config.get("id", "default_task") if task_config else "default_task"
        self.instruction = task_config.get("instruction", "") if task_config else ""
        return self._get_obs()

    def _get_obs(self) -> Dict[str, Any]:
        """Build the current observation: a controller screenshot plus None
        placeholders for the a11y tree and terminal (not captured here)."""
        return {
            "screenshot": self.controller.get_screenshot(),
            "accessibility_tree": None,
            "terminal": None,
            "instruction": self.instruction,
        }

    def step(self, action: str, pause: float = 1.0):
        """Execute one action and return (obs, reward, done, info).

        "WAIT", "FAIL" and "DONE" are control sentinels; any other string is
        handed to the controller's execute_python_command (pyautogui code).
        Reward is always 0.0 in this local environment.
        """
        self.action_history.append(action)

        done = False
        info = {}

        if action == "WAIT":
            time.sleep(pause)
        elif action == "FAIL":
            done = True
            info["fail"] = True
        elif action == "DONE":
            done = True
            info["done"] = True
        else:
            self.controller.execute_python_command(action)

        # Give the UI time to settle before grabbing the next screenshot.
        time.sleep(pause)
        obs = self._get_obs()
        return obs, 0.0, done, info

    def evaluate(self):
        """Placeholder score — local mode has no task evaluator."""
        return 0.0

    def close(self):
        """Nothing to release locally; just log the shutdown."""
        logger.info("LocalDesktopEnv closed.")

    def render(self, mode="rgb_array"):
        """Return a screenshot for "rgb_array" mode; any other mode raises."""
        if mode == "rgb_array":
            return self.controller.get_screenshot()
        raise ValueError(f"Unsupported render mode: {mode}")
async def eval_ray2():
    """Evaluate the 'Ray and Cooper 2' milestones sequentially.

    Each milestone prompt is run through the agent; a milestone passes only
    when the conversation ends with 'Result: True'.  Evaluation stops at the
    first failure, and the result object is always written to
    results/result_ray2.json via the finally block (even on Ctrl-C).
    """
    game = "Ray and Cooper 2"

    with open("milestone_prompts.json", "r") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    # failed_at / last_attempted let a partial run be diagnosed from the JSON.
    result_obj = {
        "game": "ray and cooper 2",
        "results": {},
        "failed_at": None,
        "last_attempted": None
    }

    print(f"🔍 Running evaluation for: {game}")

    # (prompt key in milestone_prompts.json, result key, display label)
    milestones = [
        ("milestone_prompt1", "vent_open", "🌀 Vent Open"),
        ("milestone_prompt2", "chef_disappear", "👨‍🍳 Chef Disappeared"),
        ("milestone_prompt3", "door_open", "🚪 Door Open"),
        ("milestone_prompt4", "attendant_disappear", "🧍‍♂️ Attendant Disappeared"),
        ("milestone_prompt5", "woman_disappear", "👩 Woman Disappeared"),
    ]

    try:
        for key, result_key, label in milestones:
            result_obj["last_attempted"] = key
            print(f"\n🏁 {key}")

            # Missing prompts are skipped, not treated as failures.
            if key not in data:
                print(f"⚠️ Skipping {key} (not found in prompt data)")
                continue

            convo = await run_milestone(data[key], f"{game}_{key}")
            result = extract_boolean_result(convo)
            result_obj["results"][result_key] = result

            print(f"{label}: {result}")

            # None (no Result line found) counts as a failure too.
            if result is not True:
                print(f"🛑 {label} failed. Stopping evaluation.")
                result_obj["failed_at"] = key
                return

        print("🎉 All milestones passed successfully!")

    except KeyboardInterrupt:
        print("\n⚠️ Evaluation interrupted by user.")

    finally:
        # Always persist whatever was collected, even on early return/interrupt.
        Path("results").mkdir(exist_ok=True)
        with open("results/result_ray2.json", "w") as f:
            json.dump(result_obj, f, indent=2)
        print("📝 Result saved to results/result_ray2.json")
async def eval_dakota():
    """Evaluate the 'Dakota Winchester's Adventures' milestones sequentially.

    Mirrors the other eval_* scripts: each milestone passes only when the
    conversation ends with 'Result: True'; evaluation stops at the first
    failure, and results are always written to results/result_dakota.json
    via the finally block (even on Ctrl-C).
    """
    game = "Dakota Winchester's Adventures"

    with open("milestone_prompts.json", "r") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    # failed_at / last_attempted let a partial run be diagnosed from the JSON.
    result_obj = {
        "game": "dakota winchester's adventures",
        "results": {},
        "failed_at": None,
        "last_attempted": None
    }

    print(f"🔍 Running evaluation for: {game}")

    # (prompt key in milestone_prompts.json, result key, display label)
    milestones = [
        ("milestone_prompt1", "stones_connected", "🪨 Stepping Stones Connected"),
        ("milestone_prompt2", "fire_burns", "🔥 Fire Burns"),
        ("milestone_prompt3", "temple_open", "🏛️ Temple Open"),
        ("milestone_prompt4", "monkey_banana", "🐒 Monkey Eats Banana"),
        ("milestone_prompt5", "lights_illuminate", "💡 Lights Illuminate"),
    ]

    try:
        for key, result_key, label in milestones:
            result_obj["last_attempted"] = key
            print(f"\n🏁 {key}")

            # Missing prompts are skipped, not treated as failures.
            if key not in data:
                print(f"⚠️ Skipping {key} (not found in prompt data)")
                continue

            convo = await run_milestone(data[key], f"{game}_{key}")
            result = extract_boolean_result(convo)
            result_obj["results"][result_key] = result

            print(f"{label}: {result}")

            # None (no Result line found) counts as a failure too.
            if result is not True:
                print(f"🛑 {label} failed. Stopping evaluation.")
                result_obj["failed_at"] = key
                return

        print("🎉 All milestones passed successfully!")

    except KeyboardInterrupt:
        print("\n⚠️ Evaluation interrupted by user.")

    finally:
        # Always persist whatever was collected, even on early return/interrupt.
        Path("results").mkdir(exist_ok=True)
        with open("results/result_dakota.json", "w") as f:
            json.dump(result_obj, f, indent=2)
        print("📝 Result saved to results/result_dakota.json")
async def eval_vortex():
    """Evaluate the 'Vortex Point1' milestones sequentially.

    Milestone 1 counts found places (must be exactly 8); milestones 2-3
    check boolean door states.  BUG FIX: the original only wrote
    results/result_vortex.json when every milestone passed — an early
    failure returned without saving anything, unlike the sibling eval_*
    scripts.  Saving now happens in a finally block, and a 'failed_at'
    field records where evaluation stopped.
    """
    game = "Vortex Point1"

    with open("milestone_prompts.json", "r") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    result_obj = {
        "game": "vortex_point1",
        "results": {},
        "failed_at": None,
    }

    print(f"🔍 Running evaluation for: {game}")

    try:
        # === milestone_prompt1 ===
        key = "milestone_prompt1"
        print(f"\n🏁 {key}")
        convo1 = await run_milestone(data[key], f"{game}_{key}")
        places = extract_found_places(convo1)

        result_obj["results"]["found_places"] = places
        print(f"📌 Found Places: {places}")

        if places != 8:
            print("🛑 Found Places is not exactly 8. Stopping evaluation.")
            result_obj["failed_at"] = key
            return

        # === milestone_prompt2 ===
        key = "milestone_prompt2"
        print(f"\n🏁 {key}")
        convo2 = await run_milestone(data[key], f"{game}_{key}")
        result2 = extract_boolean_result(convo2)

        result_obj["results"]["door_2956_open"] = result2
        print(f"🚪 2956 Vineyard Drive Door Open: {result2}")

        if result2 is not True:
            print("🛑 Door is not open. Stopping evaluation.")
            result_obj["failed_at"] = key
            return

        # === milestone_prompt3 ===
        key = "milestone_prompt3"
        print(f"\n🏁 {key}")
        convo3 = await run_milestone(data[key], f"{game}_{key}")
        result3 = extract_boolean_result(convo3)

        result_obj["results"]["wing_c_open"] = result3
        print(f"🚪 Wing C Door Open: {result3}")

    finally:
        # Always persist whatever was collected, even on early return.
        Path("results").mkdir(exist_ok=True)
        with open("results/result_vortex.json", "w") as f:
            json.dump(result_obj, f, indent=2)
        print("📝 Result saved to results/result_vortex.json")
Stopping evaluation.") 65 | return 66 | 67 | # === milestone_prompt3 === 68 | key = "milestone_prompt3" 69 | print(f"\n🏁 {key}") 70 | convo3 = await run_milestone(data[key], f"{game}_{key}") 71 | result3 = extract_boolean_result(convo3) 72 | 73 | result_obj["results"]["wing_c_open"] = result3 74 | print(f"🚪 Wing C Door Open: {result3}") 75 | 76 | Path("results").mkdir(exist_ok=True) 77 | with open("results/result_vortex.json", "w") as f: 78 | json.dump(result_obj, f, indent=2) 79 | print("📝 Result saved to results/result_vortex.json") 80 | 81 | if __name__ == "__main__": 82 | asyncio.run(eval_vortex()) -------------------------------------------------------------------------------- /game_agent/UI-Tars/desktop_env/controllers/python.py: -------------------------------------------------------------------------------- 1 | import pyautogui 2 | import time 3 | from mss import mss 4 | from PIL import Image 5 | from datetime import datetime 6 | import os 7 | 8 | 9 | class LocalController: 10 | def __init__(self, screenshot_dir="screenshots", game_name="None"): 11 | self.screenshot_dir = f"{screenshot_dir}/{game_name}" 12 | 13 | if not os.path.exists(self.screenshot_dir): 14 | os.makedirs(self.screenshot_dir) 15 | 16 | def get_screenshot(self) -> str: 17 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 18 | path = os.path.join(self.screenshot_dir, f"screenshot_{timestamp}.png") 19 | 20 | with mss() as sct: 21 | monitor = sct.monitors[1] 22 | sct_img = sct.grab(monitor) 23 | img = Image.frombytes("RGB", sct_img.size, sct_img.rgb) 24 | img.save(path) 25 | 26 | return path 27 | 28 | def execute_python_command(self, command: str): 29 | exec(command) 30 | 31 | def execute_action(self, action): 32 | if isinstance(action, str): 33 | if action == "WAIT": 34 | time.sleep(2) 35 | elif action in ["DONE", "FAIL"]: 36 | return 37 | else: 38 | exec(action) 39 | elif isinstance(action, dict): 40 | action_type = action.get("action_type") 41 | params = action.get("parameters", {}) 42 
def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
    """Run one task example against the local environment.

    Loops agent.predict -> env.step until the agent emits DONE/FAIL or
    max_steps is reached, saving a screenshot and a traj.jsonl record per
    action, then writes the (dummy) evaluation score to result.txt and
    appends it to `scores`.
    """
    runtime_logger = setup_logger(example, example_result_dir)
    agent.reset(runtime_logger)
    env.reset(task_config=example)
    time.sleep(3)  # Shorter wait for local environment
    obs = env._get_obs()
    done = False
    step_idx = 0

    while not done and step_idx < max_steps:
        # One predict call may yield several low-level actions.
        response, actions = agent.predict(instruction, obs)

        for action in actions:
            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
            logger.info("Step %d: %s", step_idx + 1, action)

            obs, reward, done, info = env.step(action, args.sleep_after_execution)

            logger.info("Reward: %.2f", reward)
            logger.info("Done: %s", done)

            # Save screenshot
            # NOTE(review): LocalDesktopEnv returns a file *path* for
            # obs['screenshot'], which has no .save(); this branch then only
            # logs a warning every step — confirm whether that is intended.
            screenshot_filename = f"step_{step_idx + 1}_{action_timestamp}.png"
            screenshot_path = os.path.join(example_result_dir, screenshot_filename)
            if hasattr(obs['screenshot'], "save"):
                obs['screenshot'].save(screenshot_path)
            else:
                logger.warning("Screenshot object has no .save() method — skipping save.")

            # Log trajectory (one JSON object per line, appended per action)
            traj_path = os.path.join(example_result_dir, "traj.jsonl")
            with open(traj_path, "a", encoding="utf-8") as f:
                f.write(json.dumps({
                    "step_num": step_idx + 1,
                    "action_timestamp": action_timestamp,
                    "action": action,
                    "reward": reward,
                    "done": done,
                    "info": info,
                    "screenshot_file": screenshot_filename
                }) + "\n")

            if done:
                logger.info("The episode is done.")
                break

        # step_idx counts predict() rounds, not individual actions.
        step_idx += 1

    # Dummy evaluation for local
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    scores.append(result)

    result_path = os.path.join(example_result_dir, "result.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write(f"{result}\n")
def setup_logger(example, example_result_dir):
    """Create (or fetch) the per-example logger writing to runtime.log.

    BUG FIX: logging.getLogger returns a cached logger per name, so calling
    this twice for the same example id used to attach a second FileHandler
    and duplicate every log line.  A handler for the same file is now added
    only once.
    """
    runtime_logger = logging.getLogger(f"desktopenv.example.{example['id']}")
    runtime_logger.setLevel(logging.DEBUG)
    log_file = os.path.join(example_result_dir, "runtime.log")

    # FileHandler stores the absolute path in .baseFilename.
    target = os.path.abspath(log_file)
    already_attached = any(
        isinstance(h, logging.FileHandler) and getattr(h, "baseFilename", None) == target
        for h in runtime_logger.handlers
    )
    if not already_attached:
        runtime_logger.addHandler(logging.FileHandler(log_file))
    return runtime_logger
def save_chat_log(entry, game_name, api_model, cua):
    """Append one game-action log entry to a JSON file.

    Path: json/{cua}/{api_model}/{game_name}/game_log_{game_name}.json
    (DOC FIX: the old docstring claimed json/{api_model}/{game_name}/{cua},
    which does not match the os.path.join order actually used.)

    A missing or corrupt log file is treated as an empty log rather than
    raising, so logging never aborts a run.
    """
    log_dir = os.path.join("json", cua, api_model, game_name)
    os.makedirs(log_dir, exist_ok=True)

    log_file = os.path.join(log_dir, f"game_log_{game_name}.json")

    logs = []
    if os.path.exists(log_file):
        with open(log_file, "r", encoding="utf-8") as f:
            try:
                logs = json.load(f)
            except json.JSONDecodeError:
                # Corrupt file: start a fresh log instead of crashing.
                logs = []

    logs.append(entry)
    with open(log_file, "w", encoding="utf-8") as f:
        json.dump(logs, f, ensure_ascii=False, indent=4)
def load_memory(json_dir, type="episodic", n=None):
    """
    Load an agent memory file from json_dir.

    type: one of 'episodic', 'clue', 'task', 'reflection'
    n: (Optional) If it's a list, only the last n items are returned
    Returns:
        - List (optionally sliced) for episodic/reflection memory
        - Dict for clue/task memory
        - The matching empty container when the file does not exist
    """
    filename = f"{type}_memory.json"
    path = os.path.join(json_dir, filename)

    if not os.path.exists(path):
        # BUG FIX: `type in ("task")` was a substring test against the plain
        # string "task" (true for "t", "as", ...) and never covered "clue",
        # contradicting the docstring.  Use a real tuple of the dict types.
        return {} if type in ("task", "clue") else []

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Slice the last n items (for list types only)
    if isinstance(data, list) and n is not None:
        return data[-n:]
    return data
async def eval_elevator():
    """Evaluate 'Elevator Room Escape' milestone by milestone.

    Each milestone result line carries a 'Final Stage: N' value and a
    'Continue: True/Final/False' flag.  'True' advances to the next
    milestone, 'Final' ends evaluation successfully, anything else stops it.
    """
    game = "Elevator Room Escape"

    # Load prompts
    with open("milestone_prompts.json", "r", encoding="utf-8") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    print(f"🎮 Starting evaluation for: {game}")

    # Run milestones 1..4 if present (works fine even if only 1..3 exist)
    for i in range(1, 4 + 1):
        prompt_key = f"milestone_prompt{i}"
        prompt = data.get(prompt_key)
        if not prompt:
            # Skip if this milestone is not defined
            continue

        print(f"\n🏁 {game} - {prompt_key}")
        conversation = await run_milestone(prompt, f"{game}_{prompt_key}")

        # Extract and print ONLY the Final Stage
        result_line = extract_result(conversation)
        final_stage = extract_final_stage(result_line or "")
        print(f"🎯 Final Stage: {final_stage if final_stage is not None else 'N/A'}")

        # Control flow based on Continue flag
        status = parse_continue_flag(result_line or "")
        if status == "continue":
            continue  # proceed to next milestone
        elif status == "final":
            print(f"🏁 Final milestone reached at {prompt_key}. Evaluation ends.")
            return
        else:
            # Covers 'Continue: False' and any missing/garbled result line.
            print(f"🛑 Evaluation stopped after {prompt_key}")
            return

    print("✅ All available milestones completed.")
# Matches a line like "Color: Sky Blue" anywhere inside a message.
COLOR_LINE_RE = re.compile(r"Color\s*:\s*([A-Za-z\- ]+)", re.IGNORECASE)

# Normalized (lowercase, hyphenated) spelling -> canonical display name.
CANONICAL_COLORS = {
    "red": "Red",
    "yellow": "Yellow",
    "green": "Green",
    "sky-blue": "Sky-Blue",
    "blue": "Blue",
    "pink": "Pink",
}

def normalize_color_name(raw: str) -> Tuple[str, Optional[str]]:
    """Return (trimmed raw text, canonical color name or None).

    Normalization: lowercase, collapse whitespace, replace spaces with
    hyphens, drop anything that is not a-z or '-', then look up the
    canonical spelling.
    """
    trimmed = (raw or "").strip()

    key = re.sub(r"\s+", " ", trimmed.lower().strip())
    key = key.replace(" ", "-")
    key = re.sub(r"[^a-z\-]", "", key)

    return trimmed, CANONICAL_COLORS.get(key)

def extract_color(conversation: List[str]) -> Optional[str]:
    """Scan the conversation newest-first for a recognizable color line."""
    for message in reversed(conversation or []):
        found = COLOR_LINE_RE.search(message)
        if not found:
            continue
        _, canonical = normalize_color_name(found.group(1))
        if canonical is not None:
            return canonical
    # No message contained a color we recognize.
    return None
async def eval_space():
    """Evaluate 'Space Museum Escape'.

    Runs the optional Instruction prompt, then milestone_prompt1, and
    extracts the reported color from the resulting conversation.  The
    canonical color (or a warning) is written to
    results/result_space_museum.json.
    """
    game = "Space Museum Escape"

    # milestone_prompts.json
    with open("milestone_prompts.json", "r", encoding="utf-8") as f:
        all_data = json.load(f)

    data = all_data.get(game, {})
    if not data:
        print(f"❌ No data found for game: {game}")
        return

    print(f"🎮 Running evaluation for: {game}")

    # 1) Instruction — optional setup prompt; its conversation is discarded.
    if "Instruction" in data:
        _ = await run_milestone(data["Instruction"], f"{game}_Instruction")

    # 2) milestone_prompt1
    prompt_key = "milestone_prompt1"
    if prompt_key not in data:
        print(f"❌ No milestone data found for {game} / {prompt_key}")
        return

    conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}")
    if conversation is None:
        print("⚠️ No conversation returned from agent.")
        return

    # Canonical color name (e.g. "Sky-Blue") or None when unrecognized.
    color = extract_color(conversation)

    if color is not None:
        print(f"Color: {color}")

        result_obj = {
            "game": "space_museum_escape",
            "result": f"color: {color}",
            "color": color,
        }

        Path("results").mkdir(exist_ok=True)
        with open("results/result_space_museum.json", "w", encoding="utf-8") as f:
            json.dump(result_obj, f, indent=2, ensure_ascii=False)
        print("📝 Result saved to results/result_space_museum.json")
    else:
        # Note: no result file is written in this case.
        print("⚠️ Couldn't find Color in result.")
8 | """ 9 | with open(image_path, "rb") as image_file: 10 | return base64.b64encode(image_file.read()).decode("utf-8") 11 | 12 | 13 | def encode_images_to_base64(image_paths): 14 | encoded = [] 15 | for path in image_paths: 16 | try: 17 | with open(path, "rb") as f: 18 | encoded_str = base64.b64encode(f.read()).decode("utf-8") 19 | encoded.append(encoded_str) 20 | except Exception as e: 21 | print(f"[WARN] Failed to encode image {path}: {e}") 22 | return encoded 23 | 24 | def extract_python_code(content): 25 | if not content: 26 | print("[ERROR] extract_python_code() received empty content") 27 | return "", "" 28 | 29 | print(f"[DEBUG] Raw content received:\n{content}\n") 30 | 31 | # 🔹 Extract only the Python code that comes after the "code" key 32 | match = re.search(r'"code"\s*:\s*"""\s*(.*?)\s*"""', content, re.DOTALL) 33 | action_match = re.search(r'"action":\s*"([^"]+)"', content) # 🔹 Find action value 34 | 35 | action_text = action_match.group(1) if action_match else "" # 🔹 Extract string value only 36 | 37 | if match: 38 | code_content = match.group(1) # Get only the Python code section 39 | 40 | # 🔹 Remove comments (multiline """ """ and single-line # comments) 41 | code_content = re.sub(r'""".*?"""', '', code_content, flags=re.DOTALL).strip() 42 | code_content = re.sub(r'^\s*#.*$', '', code_content, flags=re.MULTILINE).strip() 43 | 44 | print(f"[DEBUG] Extracted Code:\n{code_content}\n") 45 | print(f"[DEBUG] Extracted Action:\n{action_text}\n") # 🔹 Print action value 46 | return code_content, action_text 47 | 48 | print("[ERROR] No Python code found in content.") 49 | return "", action_text # 🔹 Always return action_text as well 50 | 51 | 52 | def extract_action_change(content: str) -> dict: 53 | """ 54 | Extracts whether an action succeeded and the reason from a GPT response. 
def extract_action_change(content: str) -> dict:
    """
    Extracts whether an action succeeded and the reason from a GPT response.

    Args:
        content (str): GPT response

    Returns:
        dict: {"success": True/False,
               "explanation": "Explanation of the change caused by the action"}
    """
    if not content:
        print("⚠️ content is empty.")
        return {"success": False, "explanation": "No content"}

    print(f"[DEBUG] Raw GPT response:\n{content}\n")

    # "Success_Action: True/False" -> boolean; a missing marker counts as failure.
    status = re.search(r"Success_Action:\s*(True|False)", content, re.IGNORECASE)
    succeeded = bool(status) and status.group(1).lower() == "true"

    # Everything after "Reason:" (may span multiple lines thanks to DOTALL).
    reason = re.search(r"Reason:\s*(.+)", content, re.IGNORECASE | re.DOTALL)
    explanation = reason.group(1).strip() if reason else "No explanation of change"

    return {
        "success": succeeded,
        "explanation": explanation,
    }
class LocalDesktopComputer(Computer):
    """Computer implementation that drives the local desktop via pyautogui.

    Counts "countable" actions (click/scroll/type/...) so the caller can cap
    the number of model-issued actions per turn via ``max_actions``.
    """

    def __init__(self, max_actions: int = 3):
        # Map platform.system() onto the environment labels the API expects;
        # anything that is not mac/linux is reported as windows.
        os_name = platform.system().lower()
        if "darwin" in os_name:
            self._environment = "mac"
        elif "linux" in os_name:
            self._environment = "linux"
        else:
            self._environment = "windows"
        self._dimensions = pyautogui.size()

        self._action_count = 0
        self._max_actions = max_actions
        # Only these action names increment the counter; move/wait are free.
        self._countable = ["click", "double_click", "scroll", "type", "keypress", "drag"]

    @property
    def environment(self) -> Literal["windows", "mac", "linux"]:
        return self._environment

    @property
    def dimensions(self) -> tuple[int, int]:
        # Screen size in pixels, as reported by pyautogui at construction time.
        return self._dimensions

    @property
    def action_count(self) -> int:
        return self._action_count

    @property
    def max_actions(self) -> int:
        return self._max_actions

    def _maybe_count(self, action_name: str) -> None:
        # Increment the counter only for countable actions; the caller is
        # responsible for enforcing the max_actions limit.
        if action_name in self._countable:
            self._action_count += 1
            print(f"⬆️ 액션 카운터 증가: {self._action_count}/{self._max_actions}")

    def screenshot(self) -> str:
        """Return the current screen as a base64-encoded PNG string."""
        img = pyautogui.screenshot()
        buffer = BytesIO()
        img.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    def click(self, x: int, y: int, button: str = "left") -> None:
        self._maybe_count("click")
        pyautogui.click(x=x, y=y, button=button)

    def double_click(self, x: int, y: int) -> None:
        # NOTE(review): doubleClick is issued without coordinates, i.e. at the
        # current cursor position rather than (x, y) — confirm intended.
        self._maybe_count("double_click")
        pyautogui.doubleClick(x=x, y=y)

    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        # NOTE(review): horizontal scroll_x is ignored; only vertical
        # scrolling is performed after moving to (x, y).
        self._maybe_count("scroll")
        pyautogui.moveTo(x, y)
        pyautogui.scroll(scroll_y)

    def type(self, text: str) -> None:
        self._maybe_count("type")
        pyautogui.write(text)

    def wait(self, ms: int = 1000) -> None:
        # ms is milliseconds; time.sleep takes seconds.
        time.sleep(ms / 1000)

    def move(self, x: int, y: int) -> None:
        # Pure cursor move: intentionally not counted.
        pyautogui.moveTo(x, y)

    def keypress(self, keys: List[str]) -> None:
        """Press all keys down in order, then release in reverse (a chord)."""
        self._maybe_count("keypress")
        for key in keys:
            pyautogui.keyDown(key)
        for key in reversed(keys):
            pyautogui.keyUp(key)

    def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along a list of {'x':..,'y':..} points (button held throughout)."""
        self._maybe_count("drag")
        if not path:
            return
        pyautogui.moveTo(path[0]["x"], path[0]["y"])
        pyautogui.mouseDown()
        for point in path[1:]:
            pyautogui.moveTo(point["x"], point["y"])
        pyautogui.mouseUp()

    def get_current_url(self) -> str:
        # No browser in local-desktop mode; fixed placeholder URL.
        return "file://local-desktop"

    def reset_action_counter(self) -> None:
        self._action_count = 0
-------------------------------------------------------------------------------- /evaluator/eval_game/eval_sherlock2.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import re 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from eval_game.eval_utils import run_milestone 8 | 9 | def extract_found_items(conversation: List[str]) -> int: 10 | for msg in reversed(conversation): 11 | match = re.search(r"#Found Items:\s*(\d+)", msg, re.IGNORECASE) 12 | if match: 13 | return int(match.group(1)) 14 | return -1 15 | 16 | def extract_boolean_result(conversation: List[str]) -> bool | None: 17 | for msg in reversed(conversation): 18 | match = re.search(r"Result:\s*(True|False)", msg, re.IGNORECASE) 19 | if match: 20 | return match.group(1).lower() == "true" 21 | return None 22 | 23 | async def eval_sherlock2(): 24 | game = "Sherlock Holmes 2" 25 | 26 | with open("milestone_prompts.json", "r") as f: 27 | all_data = json.load(f) 28 | 29 | data = all_data.get(game, {}) 30 | if not data: 31 | print(f"❌ No data found for game: {game}") 32 | return 33 | 34 | result_obj = { 35 | "game": "sherlock_holmes_2", 36 | "results": {}, 37 | "failed_at": None 38 | } 39 | 40 | print(f"🔍 Running evaluation for: {game}") 41 | 42 | try: 43 | # === milestone_prompt1 (Always run, even if it fails) === 44 | key = "milestone_prompt1" 45 | print(f"\n🏁 {key}") 46 | convo1 = await run_milestone(data[key], f"{game}_{key}") 47 | found_items = extract_found_items(convo1) 48 | 49 | result_obj["results"]["found_items"] = found_items 50 | print(f"🧾 Found Items: {found_items}") 51 | 52 | # === milestone_prompt2 (fails => stop) === 53 | key = "milestone_prompt2" 54 | print(f"\n🏁 {key}") 55 | convo2 = await run_milestone(data[key], f"{game}_{key}") 56 | result2 = extract_boolean_result(convo2) 57 | 58 | result_obj["results"]["fire_alarm_open"] = result2 59 | print(f"🚨 Fire Alarm Open (2F): {result2}") 60 | 61 | if result2 
def save_results(result_obj):
    """Persist the evaluation summary to results/result_sherlock2.json."""
    out_dir = Path("results")
    out_dir.mkdir(exist_ok=True)
    with open("results/result_sherlock2.json", "w") as handle:
        json.dump(result_obj, handle, indent=2)
    print("📝 Result saved to results/result_sherlock2.json")
["click", "double_click", "scroll", "type", "keypress", "drag"] 24 | 25 | @property 26 | def environment(self) -> Literal["windows", "mac", "linux"]: 27 | return self._environment 28 | 29 | @property 30 | def dimensions(self) -> tuple[int, int]: 31 | return self._dimensions 32 | 33 | @property 34 | def action_count(self) -> int: 35 | return self._action_count 36 | 37 | @property 38 | def max_actions(self) -> int: 39 | return self._max_actions 40 | 41 | def _maybe_count(self, action_name: str): 42 | if action_name in self._countable: 43 | self._action_count += 1 44 | print(f"⬆️ 액션 카운터 증가: {self._action_count}/{self._max_actions}") 45 | 46 | def screenshot(self) -> str: 47 | img = pyautogui.screenshot() 48 | buffer = BytesIO() 49 | img.save(buffer, format="PNG") 50 | return base64.b64encode(buffer.getvalue()).decode("utf-8") 51 | 52 | def click(self, x: int, y: int, button: str = "left") -> None: 53 | self._maybe_count("click") 54 | pyautogui.click(x=x, y=y, button=button) 55 | 56 | def double_click(self, x: int, y: int) -> None: 57 | self._maybe_count("double_click") 58 | pyautogui.doubleClick(x=x, y=y) 59 | 60 | def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: 61 | self._maybe_count("scroll") 62 | pyautogui.moveTo(x, y) 63 | pyautogui.scroll(scroll_y) 64 | 65 | def type(self, text: str) -> None: 66 | self._maybe_count("type") 67 | pyautogui.write(text) 68 | 69 | def wait(self, ms: int = 1000) -> None: 70 | time.sleep(ms / 1000) 71 | 72 | def move(self, x: int, y: int) -> None: 73 | pyautogui.moveTo(x, y) 74 | 75 | def keypress(self, keys: List[str]) -> None: 76 | self._maybe_count("keypress") 77 | for key in keys: 78 | pyautogui.keyDown(key) 79 | for key in reversed(keys): 80 | pyautogui.keyUp(key) 81 | 82 | def drag(self, path: List[Dict[str, int]]) -> None: 83 | self._maybe_count("drag") 84 | if not path: 85 | return 86 | pyautogui.moveTo(path[0]["x"], path[0]["y"]) 87 | pyautogui.mouseDown() 88 | for point in path[1:]: 89 | 
def __init__(self, max_actions: int = 3):
    """Detect the host OS, record the screen size, and set up action budgeting."""
    system = platform.system().lower()
    if "darwin" in system:
        env = "mac"
    elif "linux" in system:
        env = "linux"
    else:
        env = "windows"
    self._environment = env
    self._dimensions = pyautogui.size()

    self._action_count = 0
    self._max_actions = max_actions
    # Only these action types consume the action budget.
    self._countable = ["click", "double_click", "scroll", "type", "keypress", "drag"]
= "left") -> None: 53 | self._maybe_count("click") 54 | pyautogui.click(x=x, y=y, button=button) 55 | 56 | def double_click(self, x: int, y: int) -> None: 57 | self._maybe_count("double_click") 58 | pyautogui.doubleClick(x=x, y=y) 59 | 60 | def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: 61 | self._maybe_count("scroll") 62 | pyautogui.moveTo(x, y) 63 | pyautogui.scroll(scroll_y) 64 | 65 | def type(self, text: str) -> None: 66 | self._maybe_count("type") 67 | pyautogui.write(text) 68 | 69 | def wait(self, ms: int = 1000) -> None: 70 | time.sleep(ms / 1000) 71 | 72 | def move(self, x: int, y: int) -> None: 73 | pyautogui.moveTo(x, y) 74 | 75 | def keypress(self, keys: List[str]) -> None: 76 | self._maybe_count("keypress") 77 | for key in keys: 78 | pyautogui.keyDown(key) 79 | for key in reversed(keys): 80 | pyautogui.keyUp(key) 81 | 82 | def drag(self, path: List[Dict[str, int]]) -> None: 83 | self._maybe_count("drag") 84 | if not path: 85 | return 86 | pyautogui.moveTo(path[0]["x"], path[0]["y"]) 87 | pyautogui.mouseDown() 88 | for point in path[1:]: 89 | pyautogui.moveTo(point["x"], point["y"]) 90 | pyautogui.mouseUp() 91 | 92 | def get_current_url(self) -> str: 93 | return "file://local-desktop" 94 | 95 | def reset_action_counter(self): 96 | self._action_count = 0 97 | -------------------------------------------------------------------------------- /game_agent/cradle/gui_grounding/computer/computer_use.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import time 3 | import base64 4 | from typing import List, Dict, Literal 5 | from io import BytesIO 6 | from PIL import Image, ImageDraw, ImageFont 7 | import pyautogui 8 | from .computer import Computer 9 | 10 | class LocalDesktopComputer(Computer): 11 | def __init__(self, max_actions: int = 3): 12 | os_name = platform.system().lower() 13 | if "darwin" in os_name: 14 | self._environment = "mac" 15 | elif "linux" in os_name: 16 | 
self._environment = "linux" 17 | else: 18 | self._environment = "windows" 19 | self._dimensions = pyautogui.size() 20 | 21 | self._action_count = 0 22 | self._max_actions = max_actions 23 | self._countable = ["click", "double_click", "scroll", "type", "keypress", "drag"] 24 | 25 | @property 26 | def environment(self) -> Literal["windows", "mac", "linux"]: 27 | return self._environment 28 | 29 | @property 30 | def dimensions(self) -> tuple[int, int]: 31 | return self._dimensions 32 | 33 | @property 34 | def action_count(self) -> int: 35 | return self._action_count 36 | 37 | @property 38 | def max_actions(self) -> int: 39 | return self._max_actions 40 | 41 | def _maybe_count(self, action_name: str): 42 | if action_name in self._countable: 43 | self._action_count += 1 44 | print(f"⬆️ Add Action Counter: {self._action_count}/{self._max_actions}") 45 | 46 | def screenshot(self) -> str: 47 | img = pyautogui.screenshot() 48 | buffer = BytesIO() 49 | img.save(buffer, format="PNG") 50 | return base64.b64encode(buffer.getvalue()).decode("utf-8") 51 | 52 | def click(self, x: int, y: int, button: str = "left") -> None: 53 | self._maybe_count("click") 54 | pyautogui.click(x=x, y=y, button=button) 55 | 56 | def double_click(self, x: int, y: int) -> None: 57 | self._maybe_count("double_click") 58 | pyautogui.doubleClick(x=x, y=y) 59 | 60 | def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: 61 | self._maybe_count("scroll") 62 | pyautogui.moveTo(x, y) 63 | pyautogui.scroll(scroll_y) 64 | 65 | def type(self, text: str) -> None: 66 | self._maybe_count("type") 67 | pyautogui.write(text) 68 | 69 | def wait(self, ms: int = 1000) -> None: 70 | time.sleep(ms / 1000) 71 | 72 | def move(self, x: int, y: int) -> None: 73 | pyautogui.moveTo(x, y) 74 | 75 | def keypress(self, keys: List[str]) -> None: 76 | self._maybe_count("keypress") 77 | for key in keys: 78 | pyautogui.keyDown(key) 79 | for key in reversed(keys): 80 | pyautogui.keyUp(key) 81 | 82 | def drag(self, path: 
def acknowledge_safety_check_callback(message: str) -> bool:
    """Ask the operator to approve a pending safety check; True only on 'y'."""
    answer = input(f"Safety Check Warning: {message}\nProceed? (y/n): ")
    return answer.strip().lower() == "y"
def main_gpt_operator(user_prompt=None, max_retries=300):
    """Drive the CUA loop against the local desktop; returns the action count.

    Retries the model call up to max_retries times per turn when no
    'output' field comes back, then raises.
    """
    computer = LocalDesktopComputer(max_actions=300)
    tools = [{
        "type": "computer-preview",
        "display_width": computer.dimensions[0],
        "display_height": computer.dimensions[1],
        "environment": computer.environment,
    }]

    # Seed the conversation from the argument, or interactively.
    prompt = user_prompt if user_prompt else input("> ")
    items = [{"role": "user", "content": prompt}]

    while True:
        response = None
        for attempt in range(max_retries):
            candidate = create_response(
                model="computer-use-preview",
                input=items,
                tools=tools,
                truncation="auto",
            )
            if "output" in candidate:
                response = candidate
                break
            print(f"[Retry {attempt+1}/{max_retries}] No output from model. Retrying...")
            time.sleep(1)
        if response is None:
            # Exhausted every retry without a usable response.
            raise ValueError("No output from model after multiple retries")

        items += response["output"]

        for item in response["output"]:
            items += handle_item(item, computer)

        if items[-1].get("role") == "assistant":
            break

    return computer.action_count
(y/n): ").lower() 7 | return response.strip() == "y" 8 | 9 | def handle_item(item, computer: LocalDesktopComputer): 10 | if item["type"] == "message": 11 | print(item["content"][0]["text"]) 12 | 13 | if item["type"] == "computer_call": 14 | action = item["action"] 15 | action_type = action["type"] 16 | action_args = {k: v for k, v in action.items() if k != "type"} 17 | print(f"🖱️ {action_type}({action_args})") 18 | 19 | getattr(computer, action_type)(**action_args) 20 | 21 | print(f"🎯 액션 카운트: {computer.action_count}/{computer.max_actions}") 22 | 23 | screenshot_base64 = computer.screenshot() 24 | 25 | checks = item.get("pending_safety_checks", []) 26 | for check in checks: 27 | if not acknowledge_safety_check_callback(check["message"]): 28 | raise ValueError(f"Safety check failed: {check['message']}") 29 | 30 | output = { 31 | "type": "computer_call_output", 32 | "call_id": item["call_id"], 33 | "acknowledged_safety_checks": checks, 34 | "output": { 35 | "type": "input_image", 36 | "image_url": f"data:image/png;base64,{screenshot_base64}", 37 | }, 38 | } 39 | 40 | if computer.environment == "browser": 41 | current_url = computer.get_current_url() 42 | output["output"]["current_url"] = current_url 43 | check_blocklisted_url(current_url) 44 | 45 | return [output] 46 | 47 | return [] 48 | 49 | import time 50 | 51 | def main_gpt_cua(prompt_text=None, max_retries=300): 52 | computer = LocalDesktopComputer(max_actions=300) 53 | tools = [{ 54 | "type": "computer-preview", 55 | "display_width": computer.dimensions[0], 56 | "display_height": computer.dimensions[1], 57 | "environment": computer.environment, 58 | }] 59 | 60 | items = [] 61 | if prompt_text: 62 | items.append({"role": "user", "content": prompt_text}) 63 | else: 64 | user_input = input("> ") 65 | items.append({"role": "user", "content": user_input}) 66 | 67 | while True: 68 | for attempt in range(max_retries): 69 | response = create_response( 70 | model="computer-use-preview", 71 | input=items, 72 | 
def openai_completion(system_prompt, model_name, base64_images, prompt):
    """Call an OpenAI chat model with an optional list of PNG images and a text prompt.

    Returns the assistant message text.
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Build the user content: images (if any) first, then the text prompt.
    user_content = []
    if base64_images:
        for base64_image in base64_images:
            user_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{base64_image}"},
            })
    user_content.append({"type": "text", "text": prompt})

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0,
        max_tokens=1024,
    )
    return response.choices[0].message.content
def gemini_completion(system_prompt, model_name, base64_images, prompt):
    """Call a Gemini model with an optional list of PNG images and a text prompt.

    Returns the generated text, or None if generation fails.
    """
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
    # Fix: the Gemini API has no "system" role inside the contents list;
    # {"role": "system", "text": ...} made generate_content raise, and the
    # except clause silently returned None. The system prompt belongs in
    # system_instruction on the model itself.
    model = genai.GenerativeModel(
        model_name=model_name,
        system_instruction=system_prompt,
    )

    # Content parts: images (if any) first, then the text prompt.
    parts = []
    if base64_images:
        for base64_image in base64_images:
            parts.append({
                "mime_type": "image/png",
                "data": base64_image,
            })
    parts.append(prompt)

    try:
        response = model.generate_content(parts)
        return response.text
    except Exception as e:
        # Best-effort by design: log and signal failure with None.
        print(f"Error: {e}")
        return None
def gemini_completion(system_prompt, model_name, base64_images, prompt):
    """Call a Gemini model with an optional list of PNG images and a text prompt.

    Returns the generated text, or None if generation fails.
    """
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
    # Fix: the Gemini API has no "system" role inside the contents list;
    # {"role": "system", "text": ...} made generate_content raise, and the
    # except clause silently returned None. The system prompt belongs in
    # system_instruction on the model itself.
    model = genai.GenerativeModel(
        model_name=model_name,
        system_instruction=system_prompt,
    )

    # Content parts: images (if any) first, then the text prompt.
    parts = []
    if base64_images:
        for base64_image in base64_images:
            parts.append({
                "mime_type": "image/png",
                "data": base64_image,
            })
    parts.append(prompt)

    try:
        response = model.generate_content(parts)
        return response.text
    except Exception as e:
        # Best-effort by design: log and signal failure with None.
        print(f"Error: {e}")
        return None
(y/n): " 9 | ).lower() 10 | return response.strip() == "y" 11 | 12 | 13 | def handle_item(item, computer: Computer): 14 | """Handle each item; may cause a computer action + screenshot.""" 15 | if item["type"] == "message": # print messages 16 | print(item["content"][0]["text"]) 17 | 18 | if item["type"] == "computer_call": # perform computer actions 19 | action = item["action"] 20 | action_type = action["type"] 21 | action_args = {k: v for k, v in action.items() if k != "type"} 22 | print(f"{action_type}({action_args})") 23 | 24 | # give our computer environment action to perform 25 | getattr(computer, action_type)(**action_args) 26 | 27 | screenshot_base64 = computer.screenshot() 28 | 29 | pending_checks = item.get("pending_safety_checks", []) 30 | for check in pending_checks: 31 | if not acknowledge_safety_check_callback(check["message"]): 32 | raise ValueError(f"Safety check failed: {check['message']}") 33 | 34 | # return value informs model of the latest screenshot 35 | call_output = { 36 | "type": "computer_call_output", 37 | "call_id": item["call_id"], 38 | "acknowledged_safety_checks": pending_checks, 39 | "output": { 40 | "type": "input_image", 41 | "image_url": f"data:image/png;base64,{screenshot_base64}", 42 | }, 43 | } 44 | 45 | # additional URL safety checks for browser environments 46 | if computer.environment == "browser": 47 | current_url = computer.get_current_url() 48 | call_output["output"]["current_url"] = current_url 49 | check_blocklisted_url(current_url) 50 | 51 | return [call_output] 52 | 53 | return [] 54 | 55 | 56 | def main(): 57 | """Run the CUA (Computer Use Assistant) loop, using Local Playwright.""" 58 | with LocalPlaywrightComputer() as computer: 59 | tools = [ 60 | { 61 | "type": "computer-preview", 62 | "display_width": computer.dimensions[0], 63 | "display_height": computer.dimensions[1], 64 | "environment": computer.environment, 65 | } 66 | ] 67 | 68 | items = [] 69 | while True: # get user input forever 70 | user_input = input("> 
") 71 | items.append({"role": "user", "content": user_input}) 72 | 73 | while True: # keep looping until we get a final response 74 | response = create_response( 75 | model="computer-use-preview", 76 | input=items, 77 | tools=tools, 78 | truncation="auto", 79 | ) 80 | 81 | if "output" not in response: 82 | print(response) 83 | raise ValueError("No output from model") 84 | 85 | items += response["output"] 86 | 87 | for item in response["output"]: 88 | items += handle_item(item, computer) 89 | 90 | if items[-1].get("role") == "assistant": 91 | break 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /evaluator/eval_game/eval_gamecafe.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | from eval_game.eval_utils import run_milestone 4 | 5 | def parse_continue_flag(msg: str) -> str: 6 | if "Continue: True" in msg: 7 | return "continue" 8 | if "Continue: Final" in msg: 9 | return "final" 10 | return "stop" 11 | 12 | def extract_result(conversation: list[str]) -> str | None: 13 | for msg in reversed(conversation): 14 | if "Result:" in msg: 15 | return msg 16 | return None 17 | 18 | async def eval_gamecafe(): 19 | game = "Game Cafe Escape" 20 | 21 | # Prompt Load 22 | with open("milestone_prompts.json", "r") as f: 23 | all_data = json.load(f) 24 | 25 | data = all_data.get(game, {}) 26 | if not data: 27 | print(f"❌ No data found for game: {game}") 28 | return 29 | 30 | print(f"🎮 Starting evaluation for: {game}") 31 | 32 | # === milestone_prompt1 === 33 | prompt_key = "milestone_prompt1" 34 | if prompt_key in data: 35 | print(f"\n🏁 {game} - {prompt_key}") 36 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 37 | result = extract_result(conversation) 38 | print("🔍 Result:", result) 39 | 40 | status = parse_continue_flag(result or "") 41 | if status != "continue": 42 | print("🛑 Evaluation 
stopped after milestone 1") 43 | return 44 | 45 | # === milestone_prompt2 === 46 | prompt_key = "milestone_prompt2" 47 | if prompt_key in data: 48 | print(f"\n🏁 {game} - {prompt_key}") 49 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 50 | result = extract_result(conversation) 51 | print("🔍 Result:", result) 52 | 53 | status = parse_continue_flag(result or "") 54 | if status != "continue": 55 | print("🛑 Evaluation stopped after milestone 2") 56 | return 57 | 58 | # === milestone_prompt3 === 59 | prompt_key = "milestone_prompt3" 60 | if prompt_key in data: 61 | print(f"\n🏁 {game} - {prompt_key}") 62 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 63 | result = extract_result(conversation) 64 | print("🔍 Result:", result) 65 | 66 | status = parse_continue_flag(result or "") 67 | if status != "continue": 68 | print("🛑 Evaluation stopped after milestone 3") 69 | return 70 | 71 | # === milestone_prompt4 === 72 | prompt_key = "milestone_prompt4" 73 | if prompt_key in data: 74 | print(f"\n🏁 {game} - {prompt_key}") 75 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 76 | result = extract_result(conversation) 77 | print("🔍 Result:", result) 78 | 79 | status = parse_continue_flag(result or "") 80 | if status != "continue": 81 | print("🛑 Evaluation stopped after milestone 4") 82 | return 83 | 84 | # === milestone_prompt5 === 85 | prompt_key = "milestone_prompt5" 86 | if prompt_key in data: 87 | print(f"\n🏁 {game} - {prompt_key}") 88 | conversation = await run_milestone(data[prompt_key], f"{game}_{prompt_key}") 89 | result = extract_result(conversation) 90 | print("🔍 Result:", result) 91 | 92 | status = parse_continue_flag(result or "") 93 | if status != "continue": 94 | print("🏁 Final milestone reached or evaluation ends.") 95 | return 96 | 97 | print("✅ All available milestones completed.") 98 | 99 | if __name__ == "__main__": 100 | asyncio.run(eval_gamecafe()) 
--------------------------------------------------------------------------------